
"""
Partially inspired by torchtune's flex attention implementation

Citation:
@software{torchtune,
  title = {torchtune: PyTorch's finetuning library},
  author = {torchtune maintainers and contributors},
  url = {https://github.com/pytorch/torchtune},
  license = {BSD-3-Clause},
  month = apr,
  year = {2024}
}
"""

from typing import Optional, Tuple, Union

import torch
from packaging import version

from ..utils import is_torch_flex_attn_available
from ..utils.import_utils import _torch_version


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask, flex_attention
    from torch.nn.attention.flex_attention import create_block_mask as create_block_causal_mask_flex


class WrappedFlexAttention:
    """
    We are doing a singleton class so that flex attention is compiled once when it's first called.
    """

    _instance = None
    _is_flex_compiled = False
    _compiled_flex_attention = None

    def __new__(cls, *args, **kwargs):
        if cls._instance is None:
            # Create a new instance if one doesn't already exist
            cls._instance = super().__new__(cls)
        return cls._instance

    @torch.compiler.disable(recursive=False)
    def __init__(self, training):
        """
        Initialize or update the singleton instance.
        """
        if not self._is_flex_compiled or training != self.training:
            self.training = training
            if version.parse(_torch_version).base_version == "2.6.0" and training:
                self._compiled_flex_attention = torch.compile(
                    flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                )
            else:
                self._compiled_flex_attention = torch.compile(flex_attention)
            self._is_flex_compiled = True

    def __call__(self):
        return self._compiled_flex_attention


Offset = Union[torch.Tensor, int]


def make_flex_block_causal_mask(
    attention_mask_2d: torch.Tensor,
    attention_chunk_size: Optional[int] = None,
    query_length=None,
    key_length=None,
    offsets: Optional[Tuple[Offset, Offset]] = None,
) -> "BlockMask":
    """
    Create a block causal document mask for a batch of sequences, both packed and unpacked.
    Create the block causal logic and pass it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
    The resultant BlockMask is a compressed representation of the full block causal
    mask. BlockMask is essential for performant computation of flex attention.
    See: https://pytorch.org/blog/flexattention/

    Args:
        attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
        of shape (batch_size, total_seq_len). e.g.

        For unpacked sequence:
        [[1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0]]

        For packed sequence:
        [[1, 1, 1, 2, 2, 2, 0],
         [1, 1, 2, 2, 2, 3, 3]]

    Returns:
        BlockMask
    """
    batch_size, total_seq_len = attention_mask_2d.shape
    if not key_length:
        key_length = total_seq_len
    if not query_length:
        query_length = total_seq_len
    attention_mask_2d = torch.nn.functional.pad(attention_mask_2d, value=0, pad=(0, key_length))
    device = attention_mask_2d.device
    document_ids = attention_mask_2d.clone()

    if attention_chunk_size is not None:
        # Chunk indices look like [0, 0, 0, 1, 1, 1, ...]: fill with ones, cumsum, then divide by the chunk size
        chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // (attention_chunk_size)

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Defines the logic of a block causal mask by combining both a standard causal mask
        and a block diagonal document mask.

        See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
        for an illustration.
        """
        causal_mask = q_idx >= kv_idx
        document_mask = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
        padding_mask = document_ids[batch_idx, q_idx] > 0
        final_mask = causal_mask & padding_mask & document_mask
        return final_mask

    def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        """
        Combines the chunk mask with the causal mask for chunked attention.
        """
        chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
        causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
        return chunk_mask & causal_doc_mask

    mask_mod_maybe_combined = causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod

    if offsets is not None:
        q_offset = offsets[0]
        kv_offset = offsets[1]

        def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            offset_q = q_idx + q_offset
            offset_kv = kv_idx + kv_offset
            return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
    else:
        mask_mod = mask_mod_maybe_combined

    return create_block_causal_mask_flex(
        mask_mod=mask_mod,
        B=batch_size,
        H=None,
        Q_LEN=query_length,
        KV_LEN=key_length,
        device=device,
        _compile=True,
    )


@torch.compiler.disable(recursive=False)
def compile_friendly_flex_attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    training=False,
    **kwargs,
) -> torch.Tensor:
    # The first call initialises the singleton wrapper; calling the wrapper returns compiled flex attention
    flex_attention_compiled = WrappedFlexAttention(training)()
    return flex_attention_compiled(query, key, value, **kwargs)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Union[torch.Tensor, "BlockMask"],
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    block_mask = None
    causal_mask = None
    if isinstance(attention_mask, BlockMask):
        block_mask = attention_mask
    else:
        causal_mask = attention_mask

    if causal_mask is not None:
        causal_mask = causal_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if causal_mask is not None:
            score = score + causal_mask[batch_idx][0][q_idx][kv_idx]
        if head_mask is not None:
            score = score + head_mask[batch_idx][head_idx][0][0]
        return score

    enable_gqa = True
    num_local_query_heads = query.shape[1]

    # If the (local) number of query heads is not a power of two (e.g. under tensor parallelism),
    # repeat the key/value heads explicitly instead of relying on enable_gqa
    if not ((num_local_query_heads & (num_local_query_heads - 1)) == 0):
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    kernel_options = kwargs.get("kernel_options", None)
    attn_output, attention_weights = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kernel_options,
        # Flex attention computes the lse regardless, so returning it adds no extra cost
        return_lse=True,
        training=module.training,
    )
    # The lse is returned in float32; cast it back to the value dtype
    attention_weights = attention_weights.to(value.dtype)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attention_weights