# mypy: allow-untyped-defs
import torch
from torch.utils._pytree import tree_map, tree_flatten, tree_unflatten
from .module_tracker import ModuleTracker
from typing import Any, Optional, Union, TypeVar, Callable
from collections.abc import Iterator
from typing_extensions import ParamSpec
from collections import defaultdict
from torch.utils._python_dispatch import TorchDispatchMode
from math import prod
from functools import wraps
import warnings

__all__ = ["FlopCounterMode", "register_flop_formula"]

_T = TypeVar("_T")
_P = ParamSpec("_P")
aten = torch.ops.aten


def get_shape(i):
    if isinstance(i, torch.Tensor):
        return i.shape
    return i


flop_registry: dict[Any, Any] = {}


def shape_wrapper(f):
    @wraps(f)
    def nf(*args, out_val=None, **kwargs):
        args, kwargs, out_shape = tree_map(get_shape, (args, kwargs, out_val))
        return f(*args, out_shape=out_shape, **kwargs)
    return nf


def register_flop_formula(targets, get_raw=False) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
    def register_fun(flop_formula: Callable[_P, _T]) -> Callable[_P, _T]:
        if not get_raw:
            flop_formula = shape_wrapper(flop_formula)

        def register(target):
            if not isinstance(target, torch._ops.OpOverloadPacket):
                raise ValueError(
                    f"register_flop_formula(targets): expected each target to be "
                    f"OpOverloadPacket (i.e. torch.ops.mylib.foo), got "
                    f"{target} which is of type {type(target)}")
            if target in flop_registry:
                raise RuntimeError(f"duplicate registrations for {target}")
            flop_registry[target] = flop_formula

        # To handle allowing multiple aten_ops at once
        torch.utils._pytree.tree_map_(register, targets)

        return flop_formula

    return register_fun


@register_flop_formula(aten.mm)
def mm_flop(a_shape, b_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for matmul."""
    m, k = a_shape
    k2, n = b_shape
    assert k == k2
    # NB: Should be 2 * k - 1 technically for FLOPs.
    return m * n * 2 * k


@register_flop_formula(aten.addmm)
def addmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for addmm."""
    return mm_flop(a_shape, b_shape)


@register_flop_formula(aten.bmm)
def bmm_flop(a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the bmm operation."""
    b, m, k = a_shape
    b2, k2, n = b_shape
    assert b == b2
    assert k == k2
    flop = b * m * n * 2 * k
    return flop


@register_flop_formula(aten.baddbmm)
def baddbmm_flop(self_shape, a_shape, b_shape, out_shape=None, **kwargs) -> int:
    """Count flops for the baddbmm operation."""
    return bmm_flop(a_shape, b_shape)


@register_flop_formula(aten._scaled_mm)
def _scaled_mm_flop(
    a_shape,
    b_shape,
    scale_a_shape,
    scale_b_shape,
    bias_shape=None,
    scale_result_shape=None,
    out_dtype=None,
    use_fast_accum=False,
    out_shape=None,
    **kwargs,
) -> int:
    """Count flops for _scaled_mm."""
    return mm_flop(a_shape, b_shape)


def conv_flop_count(
    x_shape: list[int],
    w_shape: list[int],
    out_shape: list[int],
    transposed: bool = False,
) -> int:
    """Count flops for convolution.

    Note only multiplication is
    counted. Computation for bias is ignored.
    Flops for a transposed convolution are calculated as
    flops = (x_shape[2:] * prod(w_shape) * batch_size).
    Args:
        x_shape (list(int)): The input shape before convolution.
        w_shape (list(int)): The filter shape.
        out_shape (list(int)): The output shape after convolution.
        transposed (bool): is the convolution transposed
    Returns:
        int: the number of flops
    """
    batch_size = x_shape[0]
    conv_shape = (x_shape if transposed else out_shape)[2:]
    c_out, c_in, *filter_size = w_shape

    flop = prod(conv_shape) * prod(filter_size) * batch_size * c_out * c_in * 2
    return flop


@register_flop_formula([aten.convolution, aten._convolution])
def conv_flop(x_shape, w_shape, _bias, _stride, _padding, _dilation, transposed, *args, out_shape=None, **kwargs) -> int:
    """Count flops for convolution."""
    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)


@register_flop_formula(aten.convolution_backward)
def conv_backward_flop(
        grad_out_shape,
        x_shape,
        w_shape,
        _bias,
        _stride,
        _padding,
        _dilation,
        transposed,
        _output_padding,
        _groups,
        output_mask,
        out_shape) -> int:

    def t(shape):
        return [shape[1], shape[0]] + list(shape[2:])

    flop_count = 0

    # grad_input is computed as a convolution between grad_out and the
    # (channel-swapped) weight, and grad_weight as a convolution between the
    # input and grad_out, so each enabled gradient costs roughly one forward
    # convolution.
    if output_mask[0]:
        grad_input_shape = get_shape(out_shape[0])
        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not transposed)

    if output_mask[1]:
        grad_weight_shape = get_shape(out_shape[1])
        if transposed:
            flop_count += conv_flop_count(t(grad_out_shape), t(x_shape), t(grad_weight_shape), transposed=False)
        else:
            flop_count += conv_flop_count(t(x_shape), t(grad_out_shape), t(grad_weight_shape), transposed=False)

    return flop_count


def sdpa_flop_count(query_shape, key_shape, value_shape):
    """
    Count flops for self-attention.

    NB: We can assume that value_shape == key_shape
    """
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    assert b == _b2 == _b3 and h == _h2 == _h3 and d_q == _d2 and s_k == _s3
    total_flops = 0
    # q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))
    # scores: [b, h, s_q, s_k] @ v: [b, h, s_k, d_v] -> out: [b, h, s_q, d_v]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_v))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention,
                        aten._scaled_dot_product_flash_attention,
                        aten._scaled_dot_product_cudnn_attention])
def sdpa_flop(query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    return sdpa_flop_count(query_shape, key_shape, value_shape)


def _offsets_to_lengths(offsets, max_len):
    """
    If the offsets tensor is fake, then we don't know the actual lengths.
    In that case, we can just assume the worst case; each batch has max length.
    """
    from torch._subclasses.fake_tensor import FakeTensor
    from torch._subclasses.functional_tensor import FunctionalTensor
    if not isinstance(offsets, (FakeTensor, FunctionalTensor)) and offsets.device.type != "meta":
        return offsets.diff().tolist()
    return [max_len] * (offsets.size(0) - 1)


def _unpack_flash_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a flash_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cum_seq_q is not None:
        # This means we should be dealing with a Nested Jagged Tensor query.
        # The inputs will have shape                  (sum(sequence len), heads, dimension)
        # In comparison, non-Nested inputs have shape (batch, heads, sequence len, dimension)
        # To deal with this, we convert to a shape of (batch, heads, max_seq_len, dimension)
        # So the flops calculation in this case is an overestimate of the actual flops.
        assert len(key.shape) == 3
        assert len(value.shape) == 3
        assert grad_out is None or grad_out.shape == query.shape
        _, h_q, d_q = query.shape
        _, h_k, d_k = key.shape
        _, h_v, d_v = value.shape
        assert cum_seq_q is not None
        assert cum_seq_k is not None
        assert cum_seq_q.shape == cum_seq_k.shape
        seq_q_lengths = _offsets_to_lengths(cum_seq_q, max_q)
        seq_k_lengths = _offsets_to_lengths(cum_seq_k, max_k)
        for (seq_q_len, seq_k_len) in zip(seq_q_lengths, seq_k_lengths):
            new_query_shape = (1, h_q, seq_q_len, d_q)
            new_key_shape = (1, h_k, seq_k_len, d_k)
            new_value_shape = (1, h_v, seq_k_len, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


def _unpack_efficient_attention_nested_shapes(
    *,
    query,
    key,
    value,
    grad_out=None,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
) -> Iterator[tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], Optional[tuple[int, ...]]]]:
    """
    Given inputs to a efficient_attention_(forward|backward) kernel, this will handle behavior for
    NestedTensor inputs by effectively unbinding the NestedTensor and yielding the shapes for
    each batch element.

    In the case that this isn't a NestedTensor kernel, then it just yields the original shapes.
    """
    if cu_seqlens_q is not None:
        assert len(query.shape) == 4
        assert len(key.shape) == 4
        assert len(value.shape) == 4
        assert grad_out is None or grad_out.shape == query.shape
        _, _, h_q, d_q = query.shape
        _, _, h_k, d_k = key.shape
        _, _, h_v, d_v = value.shape
        assert cu_seqlens_q is not None
        assert cu_seqlens_k is not None
        assert cu_seqlens_q.shape == cu_seqlens_k.shape
        seqlens_q = _offsets_to_lengths(cu_seqlens_q, max_seqlen_q)
        seqlens_k = _offsets_to_lengths(cu_seqlens_k, max_seqlen_k)
        for len_q, len_k in zip(seqlens_q, seqlens_k):
            new_query_shape = (1, h_q, len_q, d_q)
            new_key_shape = (1, h_k, len_k, d_k)
            new_value_shape = (1, h_v, len_k, d_v)
            new_grad_out_shape = new_query_shape if grad_out is not None else None
            yield new_query_shape, new_key_shape, new_value_shape, new_grad_out_shape
        return

    yield query.shape, key.shape, value.shape, grad_out.shape if grad_out is not None else None


@register_flop_formula(aten._flash_attention_forward, get_raw=True)
def _flash_attention_forward_flop(
    query,
    key,
    value,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    out_shape=None,
    **kwargs,
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    # in case this is a nested tensor, unpack the individual batch elements
    # and sum the flops per batch element
    sizes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


@register_flop_formula(aten._efficient_attention_forward, get_raw=True)
def _efficient_attention_forward_flop(
    query,
    key,
    value,
    bias,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    """Count flops for self-attention."""
    # NB: We aren't accounting for causal attention here
    sizes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_flop_count(query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, _ in sizes
    )


def sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape):
    total_flops = 0
    b, h, s_q, d_q = query_shape
    _b2, _h2, s_k, _d2 = key_shape
    _b3, _h3, _s3, d_v = value_shape
    _b4, _h4, _s4, _d4 = grad_out_shape
    assert b == _b2 == _b3 == _b4 and h == _h2 == _h3 == _h4 and d_q == _d2
    assert d_v == _d4 and s_k == _s3 and s_q == _s4
    # Step 1: We recompute the scores matrix.
    # q: [b, h, s_q, d_q] @ k: [b, h, d_q, s_k] -> scores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_q), (b * h, d_q, s_k))

    # Step 2: We propagate the gradients through the score @ v operation.
    # gradOut: [b, h, s_q, d_v] @ v: [b, h, d_v, s_k] -> gradScores: [b, h, s_q, s_k]
    total_flops += bmm_flop((b * h, s_q, d_v), (b * h, d_v, s_k))
    # scores: [b, h, s_k, s_q] @ gradOut: [b, h, s_q, d_v] -> gradV: [b, h, s_k, d_v]
    total_flops += bmm_flop((b * h, s_k, s_q), (b * h, s_q, d_v))

    # Step 3: We propagate the gradients through the q @ k operation.
    # gradScores: [b, h, s_q, s_k] @ k: [b, h, s_k, d_q] -> gradQ: [b, h, s_q, d_q]
    total_flops += bmm_flop((b * h, s_q, s_k), (b * h, s_k, d_q))
    # q: [b, h, d_q, s_q] @ gradScores: [b, h, s_q, s_k] -> gradK: [b, h, d_q, s_k]
    total_flops += bmm_flop((b * h, d_q, s_q), (b * h, s_q, s_k))
    return total_flops


@register_flop_formula([aten._scaled_dot_product_efficient_attention_backward,
                        aten._scaled_dot_product_flash_attention_backward,
                        aten._scaled_dot_product_cudnn_attention_backward])
def sdpa_backward_flop(grad_out_shape, query_shape, key_shape, value_shape, *args, out_shape=None, **kwargs) -> int:
    """Count flops for self-attention backward."""
    return sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)


@register_flop_formula(aten._flash_attention_backward, get_raw=True)
def _flash_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    out,
    logsumexp,
    cum_seq_q,
    cum_seq_k,
    max_q,
    max_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_flash_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cum_seq_q=cum_seq_q,
        cum_seq_k=cum_seq_k,
        max_q=max_q,
        max_k=max_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


@register_flop_formula(aten._efficient_attention_backward, get_raw=True)
def _efficient_attention_backward_flop(
    grad_out,
    query,
    key,
    value,
    bias,
    out,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q,
    max_seqlen_k,
    *args,
    **kwargs,
) -> int:
    shapes = _unpack_efficient_attention_nested_shapes(
        query=query,
        key=key,
        value=value,
        grad_out=grad_out,
        cu_seqlens_q=cu_seqlens_q,
        cu_seqlens_k=cu_seqlens_k,
        max_seqlen_q=max_seqlen_q,
        max_seqlen_k=max_seqlen_k,
    )
    return sum(
        sdpa_backward_flop_count(grad_out_shape, query_shape, key_shape, value_shape)
        for query_shape, key_shape, value_shape, grad_out_shape in shapes
    )


def normalize_tuple(x):
    if not isinstance(x, tuple):
        return (x,)
    return x


# Define the suffixes for different orders of magnitude of flops
suffixes = ["", "K", "M", "B", "T"]


def get_suffix_str(number):
    # Find the index of the appropriate suffix based on the number of digits
    # with some additional overflow.
    # i.e. 1.01B should be displayed as 1010M, not 1.01B
    index = max(0, min(len(suffixes) - 1, (len(str(number)) - 2) // 3))
    return suffixes[index]


def convert_num_with_suffix(number, suffix):
    index = suffixes.index(suffix)
    # Divide the number by 1000^index and format it to three decimal places
    value = f"{number / 1000 ** index:.3f}"
    # Return the value and the suffix as a string
    return value + suffixes[index]


def convert_to_percent_str(num, denom):
    if denom == 0:
        return "0%"
    return f"{num / denom:.2%}"


def _pytreeify_preserve_structure(f):
    @wraps(f)
    def nf(args):
        flat_args, spec = tree_flatten(args)
        out = f(*flat_args)
        return tree_unflatten(out, spec)

    return nf


class FlopCounterMode:
    """
    ``FlopCounterMode`` is a context manager that counts the number of flops within its context.

    It does this using a ``TorchDispatchMode``.

    It also supports hierarchical output by passing a module (or list of
    modules) to FlopCounterMode on construction. If you do not need hierarchical
    output, you do not need to use it with a module.

    Example usage

    .. code-block:: python

        mod = ...
        with FlopCounterMode(mod) as flop_counter:
            mod.sum().backward()

    """

    def __init__(
            self,
            mods: Optional[Union[torch.nn.Module, list[torch.nn.Module]]] = None,
            depth: int = 2,
            display: bool = True,
            custom_mapping: Optional[dict[Any, Any]] = None):
        super().__init__()
        self.flop_counts: dict[str, dict[Any, int]] = defaultdict(lambda: defaultdict(int))
        self.depth = depth
        self.display = display
        self.mode: Optional[_FlopCounterMode] = None
        if custom_mapping is None:
            custom_mapping = {}
        if mods is not None:
            warnings.warn("mods argument is not needed anymore, you can stop passing it", stacklevel=2)
        self.flop_registry = {
            **flop_registry,
            **{k: v if getattr(v, "_get_raw", False) else shape_wrapper(v) for k, v in custom_mapping.items()}
        }
        self.mod_tracker = ModuleTracker()

    def get_total_flops(self) -> int:
        return sum(self.flop_counts['Global'].values())

    def get_flop_counts(self) -> dict[str, dict[Any, int]]:
        """Return the flop counts as a dictionary of dictionaries.

        The outer
        dictionary is keyed by module name, and the inner dictionary is keyed by
        operation name.

        Returns:
            Dict[str, Dict[Any, int]]: The flop counts as a dictionary.
        """
        return {k: dict(v) for k, v in self.flop_counts.items()}

    def get_table(self, depth=None):
        if depth is None:
            depth = self.depth
        if depth is None:
            depth = 999999

        import tabulate
        tabulate.PRESERVE_WHITESPACE = True
        header = ["Module", "FLOP", "% Total"]
        values = []
        global_flops = self.get_total_flops()
        global_suffix = get_suffix_str(global_flops)
        is_global_subsumed = False

        def process_mod(mod_name, depth):
            nonlocal is_global_subsumed

            total_flops = sum(self.flop_counts[mod_name].values())

            is_global_subsumed |= total_flops >= global_flops

            padding = " " * depth
            values = []
            values.append([
                padding + mod_name,
                convert_num_with_suffix(total_flops, global_suffix),
                convert_to_percent_str(total_flops, global_flops)
            ])
            for k, v in self.flop_counts[mod_name].items():
                values.append([
                    padding + " - " + str(k),
                    convert_num_with_suffix(v, global_suffix),
                    convert_to_percent_str(v, global_flops)
                ])
            return values

        for mod in sorted(self.flop_counts.keys()):
            if mod == 'Global':
                continue
            mod_depth = mod.count(".") + 1
            if mod_depth > depth:
                continue

            cur_values = process_mod(mod, mod_depth - 1)
            values.extend(cur_values)

        # We do a bit of messing around here to only output the "Global" value
        # if there are any FLOPs in there that aren't already fully contained
        # by a module.
        if 'Global' in self.flop_counts and not is_global_subsumed:
            for value in values:
                value[0] = " " + value[0]

            values = process_mod('Global', 0) + values

        if len(values) == 0:
            values = [["Global", "0", "0%"]]

        return tabulate.tabulate(values, headers=header, colalign=("left", "right", "right"))

    def __enter__(self):
        self.flop_counts.clear()
        self.mod_tracker.__enter__()
        self.mode = _FlopCounterMode(self)
        self.mode.__enter__()
        return self

    def __exit__(self, *args):
        assert self.mode is not None
        b = self.mode.__exit__(*args)
        self.mode = None  # break cycles
        self.mod_tracker.__exit__()
        if self.display:
            print(self.get_table(self.depth))
        return b

    def _count_flops(self, func_packet, out, args, kwargs):
        if func_packet in self.flop_registry:
            flop_count_func = self.flop_registry[func_packet]
            flop_count = flop_count_func(*args, **kwargs, out_val=out)  # type: ignore[operator]
            for par in set(self.mod_tracker.parents):
                self.flop_counts[par][func_packet] += flop_count

        return out


class _FlopCounterMode(TorchDispatchMode):
    def __init__(self, counter: FlopCounterMode):
        self.counter = counter

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs if kwargs else {}

        # Skip ops from non-standard dispatch_sizes_strides_policy such as NJT
        if func in {torch.ops.aten.is_contiguous.default,
                    torch.ops.aten.is_contiguous.memory_format,
                    torch.ops.aten.is_strides_like_format.default,
                    torch.ops.aten.is_non_overlapping_and_dense.default,
                    torch.ops.aten.size.default,
                    torch.ops.aten.sym_size.default,
                    torch.ops.aten.stride.default,
                    torch.ops.aten.sym_stride.default,
                    torch.ops.aten.storage_offset.default,
                    torch.ops.aten.sym_storage_offset.default,
                    torch.ops.aten.numel.default,
                    torch.ops.aten.sym_numel.default,
                    torch.ops.aten.dim.default,
                    torch.ops.prim.layout.default}:
            return NotImplemented

        # If we don't have func in flop_registry, see if it can decompose
        if func not in self.counter.flop_registry and func is not torch.ops.aten.detach.default:
            # NB: use `self` so the decomposed ops are also dispatched here
            with self:
                out = func.decompose(*args, **kwargs)
                if out is not NotImplemented:
                    return out

        # no further decomposition; execute & count flops
        out = func(*args, **kwargs)
        return self.counter._count_flops(func._overloadpacket, out, args, kwargs)
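

# A minimal usage sketch (not part of the upstream module): it exercises
# FlopCounterMode on a small assumed model, then shows a hypothetical custom
# formula supplied via ``custom_mapping``. All model and shape choices below
# are illustrative, and ``tabulate`` must be installed for the table printed
# when ``display=True``.
if __name__ == "__main__":
    mod = torch.nn.Sequential(
        torch.nn.Linear(16, 32),
        torch.nn.ReLU(),
        torch.nn.Linear(32, 4),
    )
    x = torch.randn(8, 16)

    # The hierarchical flop table is printed on context exit when display=True.
    with FlopCounterMode(display=True, depth=2) as flop_counter:
        mod(x).sum().backward()
    print(flop_counter.get_total_flops())            # total flops in the context
    print(flop_counter.get_flop_counts()["Global"])  # per-op breakdown

    # Hypothetical custom formula: count one "flop" per output element of
    # aten.relu. Formulas passed via custom_mapping are shape-wrapped, so
    # they receive shapes positionally plus an ``out_shape`` keyword.
    def relu_flop(inp_shape, *, out_shape=None, **kwargs):
        return prod(out_shape)

    with FlopCounterMode(custom_mapping={aten.relu: relu_flop}, display=False) as fc:
        mod(x)
    print(fc.get_total_flops())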