
import inspect
import os
from typing import Optional, TypedDict

import torch
import torch.nn.functional as F

from .utils import (
    is_flash_attn_2_available,
    is_flash_attn_greater_or_equal,
    is_flash_attn_greater_or_equal_2_10,
    is_torch_npu_available,
    logging,
)


logger = logging.get_logger(__name__)
flash_attn_func = None

if is_flash_attn_2_available():
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.layers.rotary import apply_rotary_emb

if is_torch_npu_available():
    from .integrations.npu_flash_attention import index_first_axis, pad_input, unpad_input
    from .integrations.npu_flash_attention import npu_rotary_mul as apply_rotary_emb
    from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func
    from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func

if flash_attn_func:
    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)


def is_flash_attn_available():
    """Determine whether flash-attention can be used or not."""

    # Native flash-attention is available when the `flash-attn` package is installed.
    if is_flash_attn_2_available():
        return True

    # On Ascend NPU, flash-attention is available without the `flash-attn` package.
    if is_torch_npu_available():
        return True

    return False


def flash_attn_supports_top_left_mask():
    """Determine whether flash-attention uses top-left or down-right mask"""

    if is_flash_attn_2_available():
        # The top-left mask is only used by `flash-attn` versions lower than 2.1.0.
        return not is_flash_attn_greater_or_equal_2_10()

    if is_torch_npu_available():
        from .integrations.npu_flash_attention import is_npu_fa2_top_left_aligned_causal_mask

        return is_npu_fa2_top_left_aligned_causal_mask()

    return False


def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]:
    """
Retrieves indexing data required to repad unpadded (ragged) tensors.
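
Example (a minimal illustrative sketch; the toy mask below is made up for demonstration):

```python
>>> mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.int32)
>>> indices, cu_seqlens, max_seqlen_in_batch = _get_unpad_data(mask)
>>> indices  # flattened positions of the 5 valid tokens
tensor([0, 1, 2, 4, 5])
>>> cu_seqlens  # example 0 spans [0, 3), example 1 spans [3, 5)
tensor([0, 3, 5], dtype=torch.int32)
>>> max_seqlen_in_batch
3
```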

Arguments:
    attention_mask (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

Return:
    indices (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input sequence.
    cu_seqlens (`torch.Tensor`):
        The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    max_seqlen_in_batch (`int`):
        Maximum sequence length in batch.
    """
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def _upad_input(
    query_layer: torch.Tensor,
    key_layer: torch.Tensor,
    value_layer: torch.Tensor,
    attention_mask: torch.Tensor,
    query_length: int,
):
    """
Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches.

This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary
tensors for query, key, value tensors.
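
Example (an illustrative sketch; the shapes are arbitrary and `flash-attn` needs to be installed so that
`index_first_axis`/`unpad_input` can be imported):

```python
>>> batch_size, seq_len, num_heads, num_kv_heads, head_dim = 2, 4, 8, 2, 16
>>> mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
>>> q = torch.randn(batch_size, seq_len, num_heads, head_dim)
>>> k = torch.randn(batch_size, seq_len, num_kv_heads, head_dim)
>>> v = torch.randn(batch_size, seq_len, num_kv_heads, head_dim)
>>> q_u, k_u, v_u, indices_q, (cu_q, cu_k), (max_q, max_k) = _upad_input(q, k, v, mask, seq_len)
>>> q_u.shape, k_u.shape  # only the 5 valid tokens remain after unpadding
(torch.Size([5, 8, 16]), torch.Size([5, 2, 16]))
```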

Arguments:
    query_layer (`torch.Tensor`):
        Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
    key_layer (`torch.Tensor`):
        Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    value_layer (`torch.Tensor`):
        Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    attention_mask (`torch.Tensor`):
        Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.
    query_length (`int`):
        Target length.

Return:
    query_layer (`torch.Tensor`):
        Query state without padding. Shape: (total_target_length, num_heads, head_dim).
    key_layer (`torch.Tensor`):
        Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    value_layer (`torch.Tensor`):
        Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    indices_q (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input target sequence.
    (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
        The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
        Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
    batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

    key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k)
    value_layer = index_first_axis(
        value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
    )
    if query_length == kv_seq_len:
        query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k)
        cu_seqlens_q = cu_seqlens_k
        max_seqlen_in_batch_q = max_seqlen_in_batch_k
        indices_q = indices_k
    elif query_length == 1:
        max_seqlen_in_batch_q = 1
        # There is a memcpy here, that is very bad.
        cu_seqlens_q = torch.arange(batch_size + 1, dtype=torch.int32, device=query_layer.device)
        indices_q = cu_seqlens_q[:-1]
        query_layer = query_layer.squeeze(1)
    else:
        # The -query_length: slice assumes left padding.
        attention_mask = attention_mask[:, -query_length:]
        query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, attention_mask)

    return (
        query_layer,
        key_layer,
        value_layer,
        indices_q,
        (cu_seqlens_q, cu_seqlens_k),
        (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
    )


def prepare_fa2_from_position_ids(query, key, value, position_ids):
    """
This function returns necessary arguments to call `flash_attn_varlen_func`.
All three query, key, value states will be flattened.
Cumulative lengths of each example in the batch will be extracted from position_ids.

NOTE: ideally cumulative lengths should be prepared at the data collator stage
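
Example (a small sketch with a hand-packed batch; the values are chosen only to make the outputs easy to read):

```python
>>> # One row packing three sequences of lengths 3, 2 and 4.
>>> position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])
>>> q = k = v = torch.randn(1, 9, 4, 16)
>>> q, k, v, indices_q, (cu_q, cu_k), (max_q, max_k) = prepare_fa2_from_position_ids(q, k, v, position_ids)
>>> cu_q  # cumulative boundaries of the packed sequences
tensor([0, 3, 5, 9], dtype=torch.int32)
>>> max_q  # longest packed sequence
tensor(4)
```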

Arguments:
    query (`torch.Tensor`):
        Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim).
    key (`torch.Tensor`):
        Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    value (`torch.Tensor`):
        Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim).
    position_ids (`torch.Tensor`):
        Tensor of shape (batch_size, sequence_length) holding the position index of each token, used to derive the cumulative sequence lengths.

Return:
    query (`torch.Tensor`):
        Query state without padding. Shape: (total_target_length, num_heads, head_dim).
    key (`torch.Tensor`):
        Key state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    value (`torch.Tensor`):
        Value state without padding. Shape: (total_source_length, num_key_value_heads, head_dim).
    indices_q (`torch.Tensor`):
        The indices of non-masked tokens from the flattened input target sequence.
    (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`):
        The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
    (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`):
        Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value).
    """
    query = query.view(-1, query.size(-2), query.size(-1))
    key = key.contiguous().view(-1, key.size(-2), key.size(-1))
    value = value.contiguous().view(-1, value.size(-2), value.size(-1))
    position_ids = position_ids.flatten()
    indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)

    cu_seq_lens = torch.cat(
        (
            indices_q[position_ids == 0],
            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
        )
    )

    max_length = position_ids.max() + 1

    return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


def fa_peft_integration_check(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    target_dtype: Optional[torch.dtype] = None,
):
    """
PEFT usually casts the layer norms to float32 for training stability reasons,
therefore the input hidden states get silently cast to float32. Hence, we need to
cast them back to float16 / bfloat16 just to be sure everything works as expected.
This might slow down training & inference so it is recommended to not cast the LayerNorms!
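
Example (an illustrative sketch; the float32 inputs stand in for hidden states that PEFT upcasted):

```python
>>> q = k = v = torch.randn(2, 16, 8, 64, dtype=torch.float32)
>>> q, k, v = fa_peft_integration_check(q, k, v, target_dtype=torch.float16)
>>> q.dtype, k.dtype, v.dtype
(torch.float16, torch.float16, torch.float16)
>>> # With target_dtype=None the tensors are returned unchanged.
```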

Args:
    query (`torch.Tensor`):
        Input query states to be passed to Flash Attention API
    key (`torch.Tensor`):
        Input key states to be passed to Flash Attention API
    value (`torch.Tensor`):
        Input value states to be passed to Flash Attention API
    target_dtype (`torch.dtype`, *optional*):
        The dtype to convert the attention tensors to. Conversion can be ignored by
        not providing the target dtype.
    """
    if target_dtype is None:
        return query, key, value

    input_dtype = query.dtype
    if input_dtype == torch.float32:
        logger.warning_once(
            "The input hidden states seems to be silently casted in float32, this might be related to the fact"
            " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
            f" {target_dtype}."
        )
        query = query.to(target_dtype)
        key = key.to(target_dtype)
        value = value.to(target_dtype)

    return query, key, value


flash_241 = is_flash_attn_greater_or_equal("2.4.1")
deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"


def _flash_attention_forward(
    query_states: torch.Tensor,
    key_states: torch.Tensor,
    value_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    query_length: int,
    is_causal: bool,
    dropout: float = 0.0,
    position_ids: Optional[torch.Tensor] = None,
    softmax_scale: Optional[float] = None,
    sliding_window: Optional[int] = None,
    use_top_left_mask: bool = False,
    softcap: Optional[float] = None,
    deterministic: Optional[bool] = None,
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,
    target_dtype: Optional[torch.dtype] = None,
    **kwargs,
):
    """
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
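
Example (a rough usage sketch, assuming the `flash-attn` package is installed and the tensors live on a CUDA
device in half precision; the shapes are arbitrary):

```python
>>> batch_size, seq_len, num_heads, head_dim = 2, 128, 8, 64
>>> q = torch.randn(batch_size, seq_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
>>> k = torch.randn(batch_size, seq_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
>>> v = torch.randn(batch_size, seq_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
>>> mask = torch.ones(batch_size, seq_len, dtype=torch.long, device="cuda")
>>> mask[1, 64:] = 0  # pad the second example to half length
>>> out = _flash_attention_forward(q, k, v, mask, query_length=seq_len, is_causal=True)
>>> out.shape  # same layout as the query states
torch.Size([2, 128, 8, 64])
```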

Args:
    query_states (`torch.Tensor`):
        Input query states to be passed to Flash Attention API
    key_states (`torch.Tensor`):
        Input key states to be passed to Flash Attention API
    value_states (`torch.Tensor`):
        Input value states to be passed to Flash Attention API
    attention_mask (`torch.Tensor`, *optional*):
        The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
        position of padding tokens and 1 for the position of non-padding tokens.
    dropout (`float`):
        Attention dropout
    softmax_scale (`float`, *optional*):
        The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
    use_top_left_mask (`bool`, defaults to `False`):
        flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference.
    softcap (`float`, *optional*):
        Softcap for the attention logits, used e.g. in gemma2.
    deterministic (`bool`, *optional*):
        Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled.
    """
    if not use_top_left_mask:
        causal = is_causal
    else:
        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
        causal = is_causal and query_length != 1

    # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
    use_sliding_windows = (
        _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
    )
    flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

    if flash_241:
        if deterministic is None:
            deterministic = deterministic_g
        flash_kwargs["deterministic"] = deterministic

    if softcap is not None:
        flash_kwargs["softcap"] = softcap

    # PEFT possibly silently casts tensors to fp32, this potentially reconverts to the correct dtype or is a no-op.
    query_states, key_states, value_states = fa_peft_integration_check(
        query_states, key_states, value_states, target_dtype
    )

    if attention_mask is not None:
        # The batch contains at least one padding token: unpad, run the varlen kernel, then repad the output.
        batch_size = query_states.shape[0]
        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
            query_states, key_states, value_states, attention_mask, query_length
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

        attn_output_unpad = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_in_batch_q,
            max_seqlen_k=max_seqlen_in_batch_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)

    # If position_ids is provided and the examples are packed (position ids are not monotonically increasing),
    # use `flash_attn_varlen_func` to prevent cross-example attention and to allow the padding-free approach.
    elif position_ids is not None and (
        max_length_q is not None or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        batch_size = query_states.size(0)

        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = (
                prepare_fa2_from_position_ids(query_states, key_states, value_states, position_ids)
            )
            cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
            max_length_q, max_length_k = max_seq_lens
        else:
            query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
            key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
            value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))

        attn_output = flash_attn_varlen_func(
            query_states,
            key_states,
            value_states,
            cu_seqlens_q=cu_seq_lens_q,
            cu_seqlens_k=cu_seq_lens_k,
            max_seqlen_q=max_length_q,
            max_seqlen_k=max_length_k,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )
        attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))

    else:
        attn_output = flash_attn_func(
            query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs
        )

    return attn_output


class FlashAttentionKwargs(TypedDict, total=False):
    """
Keyword arguments for Flash Attention with Compile.
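
Example (an illustrative sketch; a `TypedDict` is a plain dict at runtime and the numbers below are toy values):

```python
>>> flash_kwargs: FlashAttentionKwargs = {
...     "cu_seq_lens_q": torch.tensor([0, 3, 5, 9]),
...     "cu_seq_lens_k": torch.tensor([0, 3, 5, 9]),
...     "max_length_q": 4,
...     "max_length_k": 4,
... }
>>> # These keys can then be forwarded as **flash_kwargs to an attention implementation that accepts them.
```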

Attributes:
    cu_seq_lens_q (`torch.LongTensor`, *optional*):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`, *optional*):
        Gets cumulative sequence length for key state.
    max_length_q (`int`, *optional*):
        Maximum sequence length for query state.
    max_length_k (`int`, *optional*):
        Maximum sequence length for key state.
    """

    cu_seq_lens_q: Optional[torch.LongTensor]
    cu_seq_lens_k: Optional[torch.LongTensor]
    max_length_q: Optional[int]
    max_length_k: Optional[int]