import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import Any, Optional, Union, TypeVar, Callable
from collections.abc import Iterator
from typing_extensions import ParamSpec
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

_T = TypeVar("_T")
_P = ParamSpec("_P")

aten = torch.ops.aten

def get_shape(i):
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: dict[Any, Any] = {}


def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    def register_fun(flop_formula: Callable[_P, _T]) -> Callable[_P, _T]:
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)

        def register(target):
            if not isinstance(target, torch._ops.OpOverloadPacket):
                raise ValueError(
                    f"register_flop_formula(targets): expected each target to be "
                    f"OpOverloadPacket (i.e. torch.ops.mylib.foo), got "
                    f"{target} which is of type {type(target)}")
            if target in flop_registry:
                raise RuntimeError(f"duplicate registrations for {target}")
            flop_registry[target] = flop_formula

        # To handle allowing multiple aten_ops at once
        torch.utils._pytree.tree_map_(register, targets)

        return flop_formula

    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    # Inputs contains the shapes of two matrices.
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB(chilli): Should be 2 * k - 1 technically for FLOPs.
    return m * n * 2 * k

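# Example (a sketch, not part of the original module): the registered formula
# is shape_wrapper()-ed by default, so it can be exercised with tensors or
# shape tuples directly; a (4, 8) @ (8, 16) matmul reports 4 * 16 * 2 * 8 ==
# 1024 flops. Registering a formula for a hypothetical custom op `mylib.foo`
# (the name mirrors the error message above) would look like:
#
#     @register_flop_formula(torch.ops.mylib.foo)
#     def foo_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
#         m, k = a_shape
#         k2, n = b_shape
#         return m * n * 2 * k
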
   t ||S )zCount flops for addmm.r>   Z
self_shaper8   r9   r"   r$   r   r   r   
addmm_flopD   s   
rB   c                 K   sD   | \}}}|\}}}	||ksJ ||ksJ || |	 d | }
|
S )z"Count flops for the bmm operation.r7   r   )r8   r9   r"   r$   br:   r;   b2r<   r=   flopr   r   r   bmm_flopI   s   

rF   c                 K   r?   )z&Count flops for the baddbmm operation.rF   rA   r   r   r   baddbmm_flopV   s   
rH   c	           
      K   s
   t | |S )zCount flops for _scaled_mm.r@   )
r8   r9   Zscale_a_shapeZscale_b_shapeZ
bias_shapeZscale_result_shapeZ	out_dtypeZuse_fast_accumr"   r$   r   r   r   _scaled_mm_flop]   s   
rI   x_shapew_shaper"   
def conv_flop_count(
    x_shape: list[int],
    w_shape: list[int],
    out_shape: list[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias is ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    # For a regular conv, each point of the output spatial dimensions is a dot
    # product of the filter against a patch of the input, hence
    # prod(conv_shape) * prod(filter_size), multiplied by the batch size and
    # the cross product of input and output channels. For the transposed case
    # it is each point of the *input* spatial dimensions instead.
    # NB(chilli): Should be 2 * c_in - 1 technically for FLOPs.
    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop

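# Worked example (illustrative, not part of the original module): a 3x3 conv
# with c_in=16, c_out=32 producing a 32x32 output on batch size 8 gives
#   conv_flop_count([8, 16, 34, 34], [32, 16, 3, 3], [8, 32, 32, 32])
#   == (32*32) * (3*3) * 8 * 32 * 16 * 2 == 75,497,472 flops.
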
      O   s   t | |||dS )zCount flops for convolution.rL   )rM   )
rJ   rK   _bias_stride_padding	_dilationrL   r"   r#   r$   r   r   r   	conv_flop   s   rS   c                 C   s   dd }d}	 |
d rt |d }|t| ||| 7 }|
d rIt |d }|r9|t|| ||||dd7 }|S |t|||| ||dd7 }|S )Nc                 S   s    | d | d gt | dd   S )Nr   r   r7   )list)r   r   r   r   t   s    zconv_backward_flop.<locals>.tr   r   FrN   )r   rM   )grad_out_shaperJ   rK   rO   rP   rQ   rR   rL   Z_output_paddingZ_groupsZoutput_maskr"   rU   
flop_countZgrad_input_shapeZgrad_weight_shaper   r   r   conv_backward_flop   s   F  rX   c                 C   s   | \}}}}|\}}}	}
|\}}}}||  kr|kr8n J ||  kr)|kr8n J ||
kr8|	|kr8||
ks:J d}|t || ||f|| ||	f7 }|t || ||	f|| |	|f7 }|S )z^
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
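# Worked example (illustrative, not part of the original module): for query,
# key, and value all shaped (2, 8, 128, 64), the two batched matmuls above
# contribute (2*8) * 128 * 128 * 2 * 64 == 33,554,432 flops each, so
# sdpa_flop_count returns 67,108,864.
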
@register_flop_formula([aten._scaled_dot_product_efficient_attention,
                        aten._scaled_dot_product_flash_attention,
                        aten._scaled_dot_product_cudnn_attention])
def sdpa_flop(query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    return sdpa_flop_count(query_shape, key_shape, value_shape)


def _offsets_to_lengths(offsets, max_len):
    """
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    """
    from torch._subclasses.fake_tensor import FakeTensor
    from torch._subclasses.functional_tensor import FunctionalTensor
    if not isinstance(offsets, (FakeTensor, FunctionalTensor)) and offsets.device.type != "meta":
        return offsets.diff().tolist()
    return [max_len] * (offsets.size(0) - 1)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # This means we should be dealing with a Nested Jagged Tensor query:
        # the inputs have shape (sum(sequence len), heads, dimension) rather
        # than (batch, heads, sequence len, dimension), so we rebuild one
        # (1, heads, seq_len, dimension) shape per batch element.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = _offsets_to_lengths(cum_seq_q, max_q)
        seq_k_lengths = _offsets_to_lengths(cum_seq_k, max_k)
        for (seq_q_len, seq_k_len) in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        # Unlike the flash kernels, the efficient kernels are passed 4-d
        # (1, heads, sum(sequence len), dimension) tensors.
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, _, d_q = query.shape
        _, h_k, _, d_k = key.shape
        _, h_v, _, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = _offsets_to_lengths(cu_seqlens_q, max_seqlen_q)
        seqlens_k = _offsets_to_lengths(cu_seqlens_k, max_seqlen_k)
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None

tdd |
D S )ri   )r{   r|   r}   r~   r   r   r   c                 s   $    | ]\}}}}t |||V  qd S r   rj   .0rY   rZ   r[   r   r   r   r   	<genexpr>  
    


z0_flash_attention_forward_flop.<locals>.<genexpr>r   sum)r{   r|   r}   r~   r   r   r   r"   r#   r$   sizesr   r   r   _flash_attention_forward_flop     	r   c              	   O   s(   t | ||||||d}
tdd |
D S )ri   )r{   r|   r}   r   r   r   r   c                 s   r   r   rj   r   r   r   r   r     r   z4_efficient_attention_forward_flop.<locals>.<genexpr>r   r   )r{   r|   r}   biasr   r   r   r   r#   r$   r   r   r   r   !_efficient_attention_forward_flop  r   r   c                 C   sV  d}|\}}}}|\}	}
}}|\}}}}| \}}}}||	  kr)|  kr)|krBn J ||
  kr;|  kr;|krBn J ||ksDJ ||krP||krP||ksRJ d}|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|t || ||f|| ||f7 }|S )Nr   rG   )rV   rY   rZ   r[   rg   rC   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   Z_b4Z_h4Z_s4Z_d4r   r   r   sdpa_backward_flop_count  s   T"""""r   c                O   s   t | |||S )z(Count flops for self-attention backward.r   )rV   rY   rZ   r[   r"   r#   r$   r   r   r   sdpa_backward_flop  s   r   c
              
   O   *   t |||| ||||	d}tdd |D S )N)r{   r|   r}   rv   r~   r   r   r   c                 s   &    | ]\}}}}t ||||V  qd S r   r   r   rY   rZ   r[   rV   r   r   r   r     
    

z1_flash_attention_backward_flop.<locals>.<genexpr>r   )rv   r{   r|   r}   outZ	logsumexpr~   r   r   r   r#   r$   shapesr   r   r   _flash_attention_backward_flop     
r   c
              
   O   r   )N)r{   r|   r}   rv   r   r   r   r   c                 s   r   r   r   r   r   r   r   r   &  r   z5_efficient_attention_backward_flop.<locals>.<genexpr>r   )rv   r{   r|   r}   r   r   r   r   r   r   r#   r$   r   r   r   r   "_efficient_attention_backward_flop  r   r   c                 C   s   t | ts| fS | S r   )r   tuple)xr   r   r   normalize_tupleA  s   
# Define the suffixes for different orders of magnitude of flops
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits,
    # with some additional overflow:
    # i.e. 1.01B should be displayed as 1001M, not 1.001B.
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places
    value = f"{number / 1000 ** index:.3f}"
    # Return the value and the suffix as a string
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf

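# Quick illustration (not part of the original module): 123_456_789 has nine
# digits, so get_suffix_str picks index (9 - 2) // 3 == 2 -> "M", and
# convert_num_with_suffix(123_456_789, "M") == "123.457M".
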
dedeeeef  f fd	d
Zde
fddZdeeeee
f f fddZdddZdd Zdd Zdd Z  ZS )r   a  
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    Nr7   Tmodsdepthdisplaycustom_mappingc                    st   t    tdd | _|| _|| _d | _|d u ri }|d ur&tjddd i t	dd |
 D | _	t | _d S )Nc                   S   s   t tS r   )r   intr   r   r   r   <lambda>  s    z*FlopCounterMode.__init__.<locals>.<lambda>z<mods argument is not needed anymore, you can stop passing itr7   )
stacklevelc                 S   s*   i | ]\}}|t |d dr|nt|qS )Z_get_rawF)getattrr)   r   r;   vr   r   r   
<dictcomp>  s   * z,FlopCounterMode.__init__.<locals>.<dictcomp>)super__init__r   flop_countsr   r   modewarningswarnr    itemsr   mod_tracker)selfr   r   r   r   	__class__r   r   r   {  s   
zFlopCounterMode.__init__r*   c                 C   s   t | jd  S )NGlobal)r   r   valuesr   r   r   r   get_total_flops  s   zFlopCounterMode.get_total_flopsc                 C   s   dd | j  D S )a  Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        c                 S   s   i | ]	\}}|t |qS r   )dictr   r   r   r   r     s    z3FlopCounterMode.get_flop_counts.<locals>.<dictcomp>)r   r   r   r   r   r   get_flop_counts  s   
zFlopCounterMode.get_flop_countsc           
         s  |d u rj }|d u rd}dd l}d|_g d}g }  t d fdd}tj D ]}|dkr;q4|d	d
 }||krGq4|||d
 }|	| q4djv roso|D ]
}	d|	d  |	d< q]|dd| }t
|dkrzg dg}|j||ddS )Ni?B r   T)ModuleZFLOPz% TotalFc                    s   t j|   }| kO d| }g }|||  t|t| g j|   D ]\}}||d t| t|t| g q,|S )N z - )r   r   r   appendr   r   r   r   )mod_namer   rg   paddingr   r;   r   Zglobal_flopsZglobal_suffixZis_global_subsumedr   r   r   process_mod  s    z.FlopCounterMode.get_table.<locals>.process_modr   .r   r   )r   0r   )leftrightr   )headersZcolalign)r   tabulateZPRESERVE_WHITESPACEr   r   sortedr   keyscountextendry   )
r   r   r   headerr   r   modZ	mod_depthZ
cur_valuesr}   r   r   r   	get_table  s6   
zFlopCounterMode.get_tablec                 C   s,   | j   | j  t| | _| j  | S r   )r   clearr   	__enter___FlopCounterModer   r   r   r   r   r     s
   



zFlopCounterMode.__enter__c                 G   sD   | j d usJ | j j| }d | _ | j  | jr t| | j |S r   )r   __exit__r   r   printr   r   )r   r#   rC   r   r   r   r     s   
zFlopCounterMode.__exit__c                 C   sV   || j v r)| j | }||i |d|i}t| jjD ]}| j| |  |7  < q|S )Nr!   )r    setr   parentsr   )r   Zfunc_packetr   r#   r$   Zflop_count_funcrW   parr   r   r   _count_flops  s   

zFlopCounterMode._count_flops)Nr7   TNr   )__name__
__module____qualname____doc__r   r	   r   nnr   rT   r   boolr   r   r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r   g  s*    
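# Usage sketch (illustrative, not part of the original module): counting flops
# for a single linear layer and reading the totals back. `display=False`
# suppresses the table printed on exit; custom_mapping lets callers pass extra
# {op: formula} entries, which are shape_wrapper()-ed unless the formula has
# `_get_raw` set.
#
#     mod = torch.nn.Linear(16, 32)
#     with FlopCounterMode(display=False) as flop_counter:
#         mod(torch.randn(4, 16)).sum().backward()
#     print(flop_counter.get_total_flops())
#     print(flop_counter.get_table())
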
=	c                   @   s$   e Zd ZdefddZdddZdS )	r   counterc                 C   s
   || _ d S r   )r  )r   r  r   r   r   r     s   
z_FlopCounterMode.__init__r   Nc                 C   s0  |r|ni }|t jjjjt jjjjt jjjjt jjjjt jjjjt jjj	jt jjj
jt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjt jjjjhv rRtS || jjvr|t jjjjur|  |j|i |}|turx|W  d    S W d    n1 sw   Y  ||i |}| j|j|||S r   )r   opsatenZis_contiguousdefaultZmemory_formatZis_strides_like_formatZis_non_overlapping_and_denserr   Zsym_sizeZstrideZ
sym_strideZstorage_offsetZsym_storage_offsetZnumelZ	sym_numeldimZprimZlayoutNotImplementedr  r    ro   Z	decomposer   Z_overloadpacket)r   functypesr#   r$   rr   r   r   r   __torch_dispatch__  s6   












z#_FlopCounterMode.__torch_dispatch__)r   N)r   r   r   r   r   r  r   r   r   r   r     s    r   )Fr   )NNNFN)Vr   Ztorch.utils._pytreer   r   r   Zmodule_trackerr   typingr   r   r	   r