
    eTh$                        S SK r S SKrS SKJs  Jr  SSKJr  \" 5       (       a  S SKrS SK	r	S SK
JrJr  SrSr\" \ R                   " S\S95      r\\\4;  a  \" S5      eS	 r " S
 S\R(                  R*                  5      r\R.                  r " S S\R(                  R*                  5      r\R.                  rS rSS jr   SS jr     SS jrg)    N   )is_torch_npu_available)	rearrangerepeat   NPU_FA2_SPARSE_MODE)defaultzEnvironment variable `NPU_FA2_SPARSE_MODE` can only be set as 2 (top-left aligned causal mask) or 3 (down-right aligned causal mask).c                  >    [        5       (       a  [        [        :H  $ S$ )NF)r   SPARSE_MODE!TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE     e/var/www/auris/envauris/lib/python3.13/site-packages/transformers/integrations/npu_flash_attention.py'is_npu_fa2_top_left_aligned_causal_maskr   )   s    ?U?W?W;;;b]bbr   c                   4    \ rS rSr\S 5       r\S 5       rSrg)IndexFirstAxis.   c           
      (   U R                  U5        UR                  S:  d   eUR                  S   UR                  SS  sU l        nUR	                  5       n[
        R                  " [        US5      S[        USUS95      R                  " S/UQ76 $ )Nr   r      b ... -> b (...)z -> z dd)
save_for_backwardndimshapefirst_axis_dimnumeltorchgatherr   r   reshape)ctxinputindicesother_shape
second_dims        r   forwardIndexFirstAxis.forward/   s    g&zzQ*/++a.%++ab/'K &&(
 ||e/0!VGZS]5^

'$"$ 	$r   c           	         U R                   u  nUR                  S:  d   eUR                  SS  n[        US5      n[        R
                  " U R                  UR                  S   /UR                  UR                  S9nUR                  S[        USUR                  S   S9U5        UR                  " U R                  /UQ76 S 4$ )Nr   r   r   devicedtyper   r   r   )saved_tensorsr   r   r   r    zerosr   r,   r-   scatter_r   r"   )r#   grad_outputr%   r&   
grad_inputs        r   backwardIndexFirstAxis.backward;   s    &&
1$$$!''+-?@[[!2!21!56%%##

 	Avgz[=N=Nq=QRT_`!!#"4"4C{CTIIr   r   N__name__
__module____qualname____firstlineno__staticmethodr(   r3   __static_attributes__r   r   r   r   r   .   s*    	$ 	$ J Jr   r   c                   4    \ rS rSr\S 5       r\S 5       rSrg)IndexPutFirstAxisP   c                     U R                  U5        UR                  S:X  d   eUR                  S:  d   e[        R                  " U/UR                  SS  Q7UR
                  UR                  S.6nXU'   U$ )Nr   r   r+   )r   r   r    r/   r   r,   r-   )r#   valuesr%   r   outputs        r   r(   IndexPutFirstAxis.forwardQ   sp    g&||q   {{a^ifll12.>iv}}\b\h\hi wr   c                 0    U R                   u  nX   nUS S 4$ N)r.   )r#   r1   r%   grad_valuess       r   r3   IndexPutFirstAxis.backward\   s$    &&
!*D$&&r   r   Nr5   r   r   r   r=   r=   P   s(      ' 'r   r=   c                 4    [        XX#-  5      n[        USUS9$ )a  
Arguments:
    hidden_states: (total_nnz, ...), where total_nnz = number of tokens in selected in attention_mask.
    indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
    batch: int, batch size for the padded sequence.
    seqlen: int, maximum sequence length for the padded sequence.
Return:
    hidden_states: (batch, seqlen, ...)
z(b s) ... -> b s ...)b)index_put_first_axisr   )hidden_statesr%   batchseqlenrA   s        r   	pad_inputrM   i   s"     "-%.IFV3u==r   c                    Ub  X-   OUnUR                  S[        R                  S9nUR                  S[        R                  S9n[        R                  " UR	                  5       SS9R	                  5       nUR                  5       R                  5       n[        R                  " [        R                  " US[        R                  S9S5      n[        [        U S5      U5      UUUU4$ )a  
Arguments:
    hidden_states: (batch, seqlen, ...)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
Return:
    hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
    indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
    cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
    max_seqlen_in_batch: int
    seqused: (batch), returns the number of tokens selected in attention_mask + unused_mask.
r   )dimr-   F)as_tupler   )r   r   zb s ... -> (b s) ...)sumr    int32nonzeroflattenmaxitemFpadcumsumindex_first_axisr   )	rJ   attention_maskunused_mask	all_masksseqlens_in_batchused_seqlens_in_batchr%   max_seqlen_in_batch
cu_seqlenss	            r   unpad_inputrb   {   s     3>2I-~I }}5;;}?*..2U[[.ImmI--/%@HHJG*..0557u||$4!5;;OQWXJ 	=2HI7S r   c                    SU-
  nUc&  S[         R                  " U R                  S   5      -  nU(       d+  U R                  S   n[        R                  " XX(SXtS9S   n	U	$ [
        R                  " [
        R                  " SS/U R                  S9S	S
9R                  5       n
U R                  S   n[        R                  " U UUUSUUU
[        S9	S   n	U	$ )N      ?r   r   BSND)	keep_probscaler      r,   r   diagonal)rf   rg   
atten_masksparse_mode)mathsqrtr   	torch_npunpu_fusion_attentionr    triuonesr,   boolr   )qkv	dropout_psoftmax_scalecausalkwargsrf   head_numrA   attn_mask_npus              r   npu_flash_attn_funcr~      s     iIdii44771://a6U^tuvw  M 

5::tTl188#LWXY^^`771://$#

 
 Mr   c
                 t   SU-
  nUc&  S[         R                  " U R                  S   5      -  nU	(       d  U R                  S   n[        R                  " U UUUS S UUS[        USS  R                  5       R                  5       R                  5       5      [        USS  R                  5       R                  5       R                  5       5      S9S   nU$ [        R                  " [        R                  " SS/U R                  S9SS	9R                  5       nU R                  S   n[        R                  " U UUUS S UUUS[        USS  R                  5       R                  5       R                  5       5      [        USS  R                  5       R                  5       R                  5       5      [        S
9S   nU$ )Nrd   r   r   TND)pserl   rg   rf   input_layoutactual_seq_qlenactual_seq_kvlenr   rh   ri   rj   )	r   padding_maskrl   rg   rf   r   r   r   rm   )rn   ro   r   rp   rq   tuplecpunumpytolistr    rr   rs   r,   rt   r   )ru   rv   rw   cu_seqlens_qcu_seqlens_kmax_seqlen_qmax_seqlen_krx   ry   rz   r{   rf   r|   rA   r}   s                  r   npu_flash_attn_varlen_funcr      s    iIdii44771://!,qr"2"6"6"8">">"@"G"G"IJ"<#3#7#7#9#?#?#A#H#H#JK
 @ M% 

5::tTl188#LWXY^^`771://$!,qr"2"6"6"8">">"@"G"G"IJ"<#3#7#7#9#?#?#A#H#H#JK#
   Mr   rD   )        NF)NNr   NF)osr    torch.nn.functionalnn
functionalrW   utils.import_utilsr   rn   rp   einopsr   r   r   #DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODEintgetenvr   
ValueErrorr   autogradFunctionr   applyrZ   r=   rI   rM   rb   r~   r   r   r   r   <module>r      s    
    7 (
 %& !&' #"))1;^_`8:]^^
	1 c
JU^^,, J< "'' '// '* ).. >$J  R 4r   