o
    ZŽhåR  ã                   @   s<  d dl mZ d dlmZmZ d dlZddlmZ eG dd„ dƒƒZ	ddeej	 d	eej
eef d
ej	dedee f
dd„Z	ddeej	 d	eej
eef d
ej	dedee f
dd„Zddej	dejdee fdd„Zddej	dejdee fdd„Z	 	dd	eej
eef dejdejdedee deej	 fdd„ZdS )é    )Ú	dataclass)ÚOptionalÚUnionNé   )Úis_torchdynamo_compilingc                   @   sD  e Zd ZU dZeed< eed< d'dedee fdd„Z	d(ded	ed
ede	j
dee	jdf dee	j fdd„Z	d'de	jd	ede	j
d
ee de	jf
dd„Ze		d)de	jde	j
de	jdedee f
dd„ƒZed'de	jde	j
dee fdd„ƒZede	jdefdd „ƒZe		!d*d"ee	j d#e	jdedee d$edefd%d&„ƒZdS )+ÚAttentionMaskConvertera9  
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with slided window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer.
    Ú	is_causalÚsliding_windowNc                 C   s:   || _ || _| jd ur| jdkrtd| j› dƒ‚d S d S )Nr   zaMake sure that when passing `sliding_window` that its value is a strictly positive integer, not `ú`)r   r	   Ú
ValueError)Úselfr   r	   © r   úT/var/www/auris/lib/python3.10/site-packages/transformers/modeling_attn_mask_utils.pyÚ__init__9   s   ÿÿzAttentionMaskConverter.__init__ÚcpuÚ
batch_sizeÚquery_lengthÚkey_value_lengthÚdtypeÚdeviceÚstrÚreturnc           	      C   s\   | j std| j› dƒ‚||f}|| }d}|d dks!| jdur,| j||||| jd}|S )z¿
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).
        z"Please use `to_causal_4d` only if z has `is_causal` set to True.Néÿÿÿÿr   ©r   Úpast_key_values_lengthr	   )r   r   Ú	__class__r	   Ú_make_causal_mask)	r   r   r   r   r   r   Úinput_shaper   Úcausal_4d_maskr   r   r   Úto_causal_4dB   s   ûz#AttentionMaskConverter.to_causal_4dÚattention_mask_2dc           
      C   s¶   |j d |f}d}|d dks| jdur0| jr0|du rtdƒ‚|| }| j|||j|| jd}n	| jdur9tdƒ‚| j|||d d |j¡}|durW| 	| 
¡ t |¡j¡}|}	|	S )	a  
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        r   Nr   r   zpThis attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask.r   z?Sliding window is currently only implemented for causal masking)Útgt_len)Úshaper	   r   r   r   r   ÚNotImplementedErrorÚ_expand_maskÚtoÚmasked_fillÚboolÚtorchÚfinfoÚmin)
r   r    r   r   r   r   r   r   Zexpanded_attn_maskÚexpanded_4d_maskr   r   r   Úto_4dc   s0   ÿû
ÿzAttentionMaskConverter.to_4dr   Úinput_ids_shaper   c                 C   s  | \}}t j||ft  |¡j|d}t j| d¡|d}| ||d  | d¡d¡k d¡ | |¡}|dkrFt j	t j
||||d|gdd}|durn|| d }	t jt j|t jd|	d	}
tƒ rd| ¡ }| |
t  |¡j¡ |dddd…dd…f  |d||| ¡S )
zJ
        Make causal mask used for bi-directional self-attention.
        )r   r   r   r   ©r   r   )ÚdimN)r   )Údiagonal)r(   Úfullr)   r*   ZarangeÚsizeZmasked_fill_Úviewr%   ÚcatZzerosZtrilZ	ones_liker'   r   ÚcloneÚexpand)r-   r   r   r   r	   Úbszr!   ÚmaskZ	mask_condr0   Zcontext_maskr   r   r   r   ’   s   "
 (z(AttentionMaskConverter._make_causal_maskr8   r!   c                 C   sj   |   ¡ \}}|dur|n|}| dd…dddd…f  |d||¡ |¡}d| }| | tj¡t |¡j¡S )zg
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        Nr   ç      ð?)r2   r6   r%   r&   r(   r'   r)   r*   )r8   r   r!   r7   Zsrc_lenÚexpanded_maskÚinverted_maskr   r   r   r$   ´   s
   *z#AttentionMaskConverter._expand_maskr:   Ú	min_dtypec                 C   s0   | j tjkr
tdƒ‚|  tj| |kddd ¡S )aÏ  
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        z\AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor.r   T)r/   Zkeepdim)r   r(   r'   r   ÚmulÚall)r:   r<   r   r   r   Ú_unmask_unattendedÂ   s
   *ÿz)AttentionMaskConverter._unmask_unattendedFÚattention_maskÚinputs_embedsÚis_trainingc           
      C   sÄ   |j d |j d }}|| }tj ¡ pt|tjjƒptƒ }d}	| du r<|s(|s:|dks0||kr:|du s8||k r:d}	|	S |du sD||k r`t| j ƒdkrMdS |s`t 	| dk¡r`|dks^||kr`d}	|	S )a9  
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks,
        allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is
        passed).
        r   r   FNTé   )
r"   r(   ÚjitÚ
is_tracingÚ
isinstanceÚfxÚProxyr   Úlenr>   )
r@   rA   r   r	   rB   Ú_r   r   rE   Úignore_causal_maskr   r   r   Ú_ignore_causal_mask_sdpaó   s(   ÿÿòz/AttentionMaskConverter._ignore_causal_mask_sdpa©N)r   ©r   N)NF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r'   Ú__annotations__Úintr   r   r(   r   r   r   ÚTensorr   r,   ÚstaticmethodÚSizer   r$   ZFloatTensorÚfloatr?   rL   r   r   r   r   r      sŠ   
 úþýüûú
ù&ûþýüû
ú/ûÿþýüû!"ÿþ0ûÿþýüûúr   r@   r   rA   r   r	   c           	      C   sì   t d|d}|d | }| dur%t| jƒdkr%|j| |d ||jd} | S | durdt| jƒdkrd|d d	|d	 |f}t| jƒ|krOtd
t| jƒ› d|› dƒ‚d|  }| | t	j
¡t	 |j¡j¡} | S |j|d |d ||j|jd} | S )añ  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    T©r   r	   r   Né   )r   r   rC   r   r   z#Incorrect 4D attention_mask shape: z; expected: Ú.r9   r.   )r   rI   r"   r,   r   Útupler   r&   r%   r(   r'   r)   r*   r   r   )	r@   r   rA   r   r	   Úattn_mask_converterr   Zexpected_shaper;   r   r   r   Ú!_prepare_4d_causal_attention_mask.  s,   ÿïÿÿüÿr^   c           
      C   sÖ   t d|d}|d | }tj ¡ pt|tjjƒptƒ }t j| |||d}|r*d}	|	S | du r@|j	|d |d ||j
|jd}	|	S |  ¡ dkrI| }	n|j| |d |j
|d	}	|si|	jjd
krit j|	t |j
¡jd}	|	S )aé  
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
    TrY   r   )r@   rA   r   r	   Nr   r.   rC   )r   r   Úcuda)r<   )r   r(   rD   rE   rF   rG   rH   r   rL   r   r   r   r/   r,   Útyper?   r)   r*   )
r@   r   rA   r   r	   r]   r   rE   rK   r+   r   r   r   Ú*_prepare_4d_causal_attention_mask_for_sdpac  s<   üéÿîü
ÿra   r8   r   r!   c                 C   s   t j| ||dS )áÎ  
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    ©r8   r   r!   )r   r$   rc   r   r   r   Ú_prepare_4d_attention_mask  s   rd   c                 C   s^   | j \}}|dur|n|}tj ¡ pt| tjjƒptƒ }|s't | dk¡r'dS t	j
| ||dS )rb   Nr   rc   )r"   r(   rD   rE   rF   rG   rH   r   r>   r   r$   )r8   r   r!   rJ   r   rE   r   r   r   Ú#_prepare_4d_attention_mask_for_sdpa­  s   
re   r   r   c                 C   s8   t d|d}|| d  }|j| d | d |||d}|S )a/  
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`int`):
            The torch device the created mask shall have.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    TrY   r   r   r.   )r   r   )r   r   r   r   r	   r]   r   r@   r   r   r   Ú _create_4d_causal_attention_maskÆ  s   ÿrf   rM   rN   )Údataclassesr   Útypingr   r   r(   Zutils.import_utilsr   r   rU   rW   r\   ÚlistrT   r^   ra   r   rd   re   r   rf   r   r   r   r   Ú<module>   sb     ûÿþýü
û:ûÿþýü
û : ûÿþýüûú