import math
import os

import torch
import torch.nn.functional as F

from ..utils.import_utils import is_torch_npu_available


if is_torch_npu_available():
    import torch_npu
    from einops import rearrange, repeat


# FlashAttention2 on Ascend NPU supports two causal-mask alignments, selected through the
# `NPU_FA2_SPARSE_MODE` environment variable: 2 (top-left aligned) or 3 (down-right aligned,
# the default).
TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE = 2
DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE = 3

SPARSE_MODE = int(os.getenv("NPU_FA2_SPARSE_MODE", default=DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE))
if SPARSE_MODE not in [TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE, DOWN_RIGHT_ALIGNED_CAUSAL_MASK_MODE]:
    raise ValueError(
        "Environment variable `NPU_FA2_SPARSE_MODE` can only be set as 2 (top-left aligned causal mask) "
        "or 3 (down-right aligned causal mask)."
    )


def is_npu_fa2_top_left_aligned_causal_mask():
    return SPARSE_MODE == TOP_LEFT_ALIGNED_CAUSAL_MASK_MODE if is_torch_npu_available() else False
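# A minimal usage sketch (illustrative, not part of the module): the sparse mode is read
# once at import time, so `NPU_FA2_SPARSE_MODE` must be set before this module is imported.
#
#     import os
#     os.environ["NPU_FA2_SPARSE_MODE"] = "2"  # top-left aligned causal mask
#     from transformers.integrations.npu_flash_attention import (
#         is_npu_fa2_top_left_aligned_causal_mask,
#     )
#     is_npu_fa2_top_left_aligned_causal_mask()  # True on NPU hosts, False elsewhere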
class IndexFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, indices):
        ctx.save_for_backward(indices)
        assert input.ndim >= 2
        ctx.first_axis_dim, other_shape = input.shape[0], input.shape[1:]
        second_dim = other_shape.numel()
        # Equivalent to input[indices], expressed as a gather over the flattened trailing dims.
        return torch.gather(
            rearrange(input, "b ... -> b (...)"), 0, repeat(indices, "z -> z d", d=second_dim)
        ).reshape(-1, *other_shape)

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        assert grad_output.ndim >= 2
        other_shape = grad_output.shape[1:]
        grad_output = rearrange(grad_output, "b ... -> b (...)")
        grad_input = torch.zeros(
            [ctx.first_axis_dim, grad_output.shape[1]], device=grad_output.device, dtype=grad_output.dtype
        )
        # Scatter the incoming gradients back to their original row positions.
        grad_input.scatter_(0, repeat(indices, "z -> z d", d=grad_output.shape[1]), grad_output)
        return grad_input.reshape(ctx.first_axis_dim, *other_shape), None


index_first_axis = IndexFirstAxis.apply
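# A minimal sketch (illustrative shapes): `index_first_axis` keeps selected rows of the
# first axis and is differentiable through the scatter-based backward above.
#
#     x = torch.randn(10, 4, 8)          # (first_axis, heads, dim)
#     idx = torch.tensor([0, 3, 7])
#     picked = index_first_axis(x, idx)  # (3, 4, 8), same values as x[idx]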
class IndexPutFirstAxis(torch.autograd.Function):
    @staticmethod
    def forward(ctx, values, indices, first_axis_dim):
        ctx.save_for_backward(indices)
        assert indices.ndim == 1
        assert values.ndim >= 2
        output = torch.zeros(first_axis_dim, *values.shape[1:], device=values.device, dtype=values.dtype)
        # Write the rows into a zero-initialized buffer at the given first-axis positions.
        output[indices] = values
        return output

    @staticmethod
    def backward(ctx, grad_output):
        (indices,) = ctx.saved_tensors
        # The gradient of an index-put is a gather over the same indices.
        grad_values = grad_output[indices]
        return grad_values, None, None


index_put_first_axis = IndexPutFirstAxis.apply
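# A minimal sketch (illustrative shapes): `index_put_first_axis` is the inverse of
# `index_first_axis`, scattering rows back into a zero-padded first axis.
#
#     vals = torch.randn(3, 4, 8)
#     idx = torch.tensor([0, 3, 7])
#     restored = index_put_first_axis(vals, idx, 10)  # (10, 4, 8), zeros at unused rows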
def pad_input(hidden_states, indices, batch, seqlen):
    """
    Arguments:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
        indices: (total_nnz), the indices that represent the non-masked tokens of the original padded input sequence.
        batch: int, batch size for the padded sequence.
        seqlen: int, maximum sequence length for the padded sequence.
    Return:
        hidden_states: (batch, seqlen, ...)
    """
    output = index_put_first_axis(hidden_states, indices, batch * seqlen)
    return rearrange(output, "(b s) ... -> b s ...", b=batch)
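# A minimal sketch (illustrative shapes): re-padding unpadded token states into a dense
# (batch, seqlen, ...) layout, using indices as produced by `unpad_input` below.
#
#     flat = torch.randn(5, 16)            # 5 valid tokens, hidden size 16
#     idx = torch.tensor([0, 1, 4, 5, 6])  # flat positions inside a (2, 4) grid
#     padded = pad_input(flat, idx, batch=2, seqlen=4)  # (2, 4, 16), zeros at padding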
def unpad_input(hidden_states, attention_mask, unused_mask=None):
    """
    Arguments:
        hidden_states: (batch, seqlen, ...)
        attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
        unused_mask: (batch, seqlen), bool / int, 1 means the element is allocated but unused.
    Return:
        hidden_states: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask + unused_mask.
        indices: (total_nnz), the indices of masked tokens from the flattened input sequence.
        cu_seqlens: (batch + 1), the cumulative sequence lengths, used to index into hidden_states.
        max_seqlen_in_batch: int
        seqused: (batch), the number of tokens selected in attention_mask + unused_mask.
    """
    all_masks = (attention_mask + unused_mask) if unused_mask is not None else attention_mask
    seqlens_in_batch = all_masks.sum(dim=-1, dtype=torch.int32)
    used_seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(all_masks.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        index_first_axis(rearrange(hidden_states, "b s ... -> (b s) ..."), indices),
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
        used_seqlens_in_batch,
    )
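# A minimal sketch (illustrative shapes): unpadding a dense batch and restoring it.
# `indices`, `cu_seqlens`, and `max_seqlen_in_batch` feed the varlen kernel further below.
#
#     hidden = torch.randn(2, 4, 16)
#     mask = torch.tensor([[1, 1, 0, 0], [1, 1, 1, 0]])
#     flat, idx, cu_seqlens, max_len, seqused = unpad_input(hidden, mask)
#     # flat: (5, 16); cu_seqlens: [0, 2, 5]; max_len: 3; seqused: [2, 3]
#     assert torch.equal(pad_input(flat, idx, 2, 4)[mask.bool()], flat)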
def npu_flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=False, **kwargs):
    keep_prob = 1.0 - dropout_p
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(q.shape[-1])

    if not causal:
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(q, k, v, head_num, "BSND", keep_prob=keep_prob, scale=softmax_scale)[0]
    else:
        # Causal masking: an upper-triangular boolean mask whose alignment is governed by SPARSE_MODE.
        attn_mask_npu = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
        head_num = q.shape[2]
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            "BSND",
            keep_prob=keep_prob,
            scale=softmax_scale,
            atten_mask=attn_mask_npu,
            sparse_mode=SPARSE_MODE,
        )[0]

    return output
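# A minimal sketch (illustrative shapes; requires an Ascend NPU with `torch_npu`):
# dense attention over "BSND" tensors, i.e. (batch, seqlen, num_heads, head_dim).
#
#     q = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="npu")
#     k = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="npu")
#     v = torch.randn(2, 128, 8, 64, dtype=torch.float16, device="npu")
#     out = npu_flash_attn_func(q, k, v, causal=True)  # (2, 128, 8, 64)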
def npu_flash_attn_varlen_func(
    q,
    k,
    v,
    cu_seqlens_q,
    cu_seqlens_k,
    max_seqlen_q=None,
    max_seqlen_k=None,
    dropout_p=0.0,
    softmax_scale=None,
    causal=False,
    **kwargs,
):
    keep_prob = 1.0 - dropout_p
    if softmax_scale is None:
        softmax_scale = 1.0 / math.sqrt(q.shape[-1])

    if not causal:
        head_num = q.shape[1]
        # The kernel expects per-sequence end offsets, i.e. cu_seqlens without the leading zero.
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            pse=None,
            atten_mask=None,
            scale=softmax_scale,
            keep_prob=keep_prob,
            input_layout="TND",
            actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
            actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
        )[0]
    else:
        attn_mask_npu = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
        head_num = q.shape[1]
        output = torch_npu.npu_fusion_attention(
            q,
            k,
            v,
            head_num,
            pse=None,
            padding_mask=None,
            atten_mask=attn_mask_npu,
            scale=softmax_scale,
            keep_prob=keep_prob,
            input_layout="TND",
            actual_seq_qlen=tuple(cu_seqlens_q[1:].cpu().numpy().tolist()),
            actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
            sparse_mode=SPARSE_MODE,
        )[0]

    return output
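# A minimal sketch (illustrative shapes; requires an Ascend NPU with `torch_npu`):
# variable-length attention over "TND" tensors (total_tokens, num_heads, head_dim), driven
# by the cumulative sequence lengths produced by `unpad_input`.
#
#     q = k = v = torch.randn(5, 8, 64, dtype=torch.float16, device="npu")
#     cu = torch.tensor([0, 2, 5], dtype=torch.int32)  # two sequences, lengths 2 and 3
#     out = npu_flash_attn_varlen_func(q, k, v, cu, cu, causal=True)  # (5, 8, 64)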