
from dataclasses import dataclass
from typing import Optional, Union

import torch

from .utils.import_utils import is_torchdynamo_compiling


@dataclass
class AttentionMaskConverter:
    """
    A utility attention mask class that allows one to:
        - Create a causal 4d mask
        - Create a causal 4d mask with sliding window
        - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length,
          key_value_length) that can be multiplied with attention scores

    Examples:

    ```python
    >>> import torch
    >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter

    >>> converter = AttentionMaskConverter(True)
    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
    tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00, -3.4028e+38],
            [-3.4028e+38, -3.4028e+38, -3.4028e+38,  0.0000e+00,  0.0000e+00]]]])
    ```

    Parameters:
        is_causal (`bool`):
            Whether the attention mask should be a uni-directional (causal) or bi-directional mask.

        sliding_window (`int`, *optional*):
            Optionally, a sliding window mask can be created if `sliding_window` is set to a strictly positive integer.
    """
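
    # Note on how the pieces combine (an illustration reasoned from `to_4d` below, not an upstream example): for a
    # causal converter, `to_4d` builds the causal bias with `_make_causal_mask` and then re-applies the padding
    # information of the 2d mask on top of it, so an entry of the returned mask is `0.0` only if that key position is
    # both causally visible and not padded, e.g.
    #
    #     converter = AttentionMaskConverter(is_causal=True)
    #     mask_4d = converter.to_4d(torch.tensor([[0, 0, 1, 1, 1]]), 5, dtype=torch.float32, key_value_length=5)
    #     # mask_4d.shape == (1, 1, 5, 5); columns 0-1 (padding) and the upper triangle hold torch.finfo(dtype).min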

    is_causal: bool
    sliding_window: Optional[int]

    def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
        self.is_causal = is_causal
        self.sliding_window = sliding_window

        if self.sliding_window is not None and self.sliding_window <= 0:
            raise ValueError(
                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
            )

    def to_causal_4d(
        self, batch_size: int, query_length: int, key_value_length: int, dtype: torch.dtype,
        device: Union[torch.device, str] = "cpu",
    ) -> Optional[torch.Tensor]:
        """
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds a large negative
        bias to the upper right hand triangular matrix (causal mask).
        """
        if not self.is_causal:
            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")

        input_shape = (batch_size, query_length)
        past_key_values_length = key_value_length - query_length

        # create the causal mask only if there is something to hide: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if input_shape[-1] > 1 or self.sliding_window is not None:
            causal_4d_mask = self._make_causal_mask(
                input_shape, dtype, device=device, past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )

        return causal_4d_mask

    def to_4d(
        self, attention_mask_2d: torch.Tensor, query_length: int, dtype: torch.dtype,
        key_value_length: Optional[int] = None,
    ) -> torch.Tensor:
        """
        Converts a 2D attention mask into a 4D attention mask by expanding it to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.
        """
        input_shape = (attention_mask_2d.shape[0], query_length)

        # create the causal part of the mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
                )

            past_key_values_length = key_value_length - query_length
            causal_4d_mask = self._make_causal_mask(
                input_shape, dtype, device=attention_mask_2d.device, past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )
        elif self.sliding_window is not None:
            raise NotImplementedError("Sliding window is currently only implemented for causal masking")

        # expand the 2d padding mask to [bsz, 1, tgt_seq_len, src_seq_len] and merge it with the causal mask
        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to(
            attention_mask_2d.device
        )

        if causal_4d_mask is not None:
            expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min)

        expanded_4d_mask = expanded_attn_mask

        return expanded_4d_mask

    @staticmethod
    def _make_causal_mask(
        input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device,
        past_key_values_length: int = 0, sliding_window: Optional[int] = None,
    ):
        """
        Make causal mask used for bi-directional self-attention.
        """
        bsz, tgt_len = input_ids_shape
        mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
        mask_cond = torch.arange(mask.size(-1), device=device)
        mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)

        mask = mask.to(dtype)

        # prepend zero bias for the cached key/value positions, which every query may attend to
        if past_key_values_length > 0:
            mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)

        # add a lower triangular sliding window mask if necessary
        if sliding_window is not None:
            diagonal = past_key_values_length - sliding_window - 1

            context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal)
            # avoid an in-place mutation on the original tensor when compiling with torchdynamo
            if is_torchdynamo_compiling():
                mask = mask.clone()
            mask.masked_fill_(context_mask, torch.finfo(dtype).min)

        return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)

    @staticmethod
    def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
        """
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
        """
        bsz, src_len = mask.size()
        tgt_len = tgt_len if tgt_len is not None else src_len

        expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

        # invert: attended positions (1) become 0.0, masked positions (0) become the most negative value of `dtype`
        inverted_mask = 1.0 - expanded_mask

        return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)

    @staticmethod
    def _unmask_unattended(expanded_mask: torch.FloatTensor, min_dtype: float):
        """
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case
        of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        """
        if expanded_mask.dtype == torch.bool:
            raise ValueError(
                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
            )

        return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True))

    @staticmethod
    def _ignore_causal_mask_sdpa(
        attention_mask: Optional[torch.Tensor], inputs_embeds: torch.Tensor, past_key_values_length: int,
        sliding_window: Optional[int] = None, is_training: bool = False,
    ) -> bool:
        """
        Detects whether the optional user-specified attention_mask & the automatically created causal mask can be
        ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument.

        In case no token is masked in the `attention_mask` argument, if `query_length == 1` or
        `key_value_length == query_length`, we rather rely on SDPA's `is_causal` argument to use causal/non-causal
        masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom
        `attn_mask` is passed).
        """
        _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1]
        key_value_length = query_length + past_key_values_length

        is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling()

        ignore_causal_mask = False

        if attention_mask is None:
            # the mask can only be dropped when SDPA's `is_causal` flag reproduces it exactly
            if (
                (is_training or not is_tracing)
                and (query_length == 1 or key_value_length == query_length)
                and (sliding_window is None or key_value_length < sliding_window)
            ):
                ignore_causal_mask = True
        elif sliding_window is None or key_value_length < sliding_window:
            if len(attention_mask.shape) == 4:
                return False
            elif not is_tracing and torch.all(attention_mask == 1):
                if query_length == 1 or key_value_length == query_length:
                    # with no padding, SDPA's `is_causal=True` reproduces this mask exactly for these two cases
                    ignore_causal_mask = True
                # for query_length > 1 with key_value_length != query_length the mask is kept, since SDPA's
                # `is_causal` may not align the causal pattern with the key/value cache in that case

        return ignore_causal_mask


def _prepare_4d_causal_attention_mask(
    attention_mask: Optional[torch.Tensor], input_shape: Union[torch.Size, tuple, list], inputs_embeds: torch.Tensor,
    past_key_values_length: int, sliding_window: Optional[int] = None,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    if attention_mask is not None and len(attention_mask.shape) == 2:
        # standard 2D padding mask: expand it and merge it with the causal mask
        attention_mask = attn_mask_converter.to_4d(
            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        # a user-provided 4D mask is only inverted into an additive bias, after a shape sanity check
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            # if the 4D mask has the correct shape - invert it and fill with negative infinity
            inverted_mask = 1.0 - attention_mask
            attention_mask = inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
            )
    else:
        # no mask provided: build a purely causal one
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )

    return attention_mask


def _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask: Optional[torch.Tensor], input_shape: Union[torch.Size, tuple, list], inputs_embeds: torch.Tensor,
    past_key_values_length: int, sliding_window: Optional[int] = None,
):
    """
    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.

    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases
    `query_length == 1` and `key_value_length == query_length`, and rely instead on SDPA's `is_causal` argument to use
    causal/non-causal masks, allowing to dispatch to the flash attention kernel (that can otherwise not be used if a
    custom `attn_mask` is passed).
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True cannot capture data-dependent control flow,
    # so the mask is always materialized when tracing
    is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling()

    ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa(
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
        sliding_window=sliding_window,
    )

    if ignore_causal_mask:
        expanded_4d_mask = None
    elif attention_mask is None:
        expanded_4d_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
        )
    else:
        if attention_mask.dim() == 4:
            expanded_4d_mask = attention_mask
        else:
            expanded_4d_mask = attn_mask_converter.to_4d(
                attention_mask, input_shape[-1], dtype=inputs_embeds.dtype, key_value_length=key_value_length
            )

        # Attend to all tokens in fully masked rows, as required by the SDPA memory-efficient attention path.
        # Details: https://github.com/pytorch/pytorch/issues/110213
        if not is_tracing and expanded_4d_mask.device.type == "cuda":
            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
                expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min
            )

    return expanded_4d_mask


def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)`
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        tgt_len (`int`):
            The target length or query length the created mask shall have.
    """
    return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Same as `_prepare_4d_attention_mask`, but returns `None` when no position is masked, so that the caller can skip
    the `attn_mask` argument entirely when using `torch.nn.functional.scaled_dot_product_attention`.
    """
    _, key_value_length = mask.shape
    tgt_len = tgt_len if tgt_len is not None else key_value_length

    is_tracing = torch.jit.is_tracing() or isinstance(mask, torch.fx.Proxy) or is_torchdynamo_compiling()

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True cannot capture the data-dependent shortcut
    if not is_tracing and torch.all(mask == 1):
        return None
    else:
        return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)


def _create_4d_causal_attention_mask(
    input_shape: Union[torch.Size, tuple, list], dtype: torch.dtype, device: torch.device,
    past_key_values_length: int = 0, sliding_window: Optional[int] = None,
) -> Optional[torch.Tensor]:
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`

    Args:
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        dtype (`torch.dtype`):
            The torch dtype the created mask shall have.
        device (`torch.device`):
            The torch device the created mask shall have.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = past_key_values_length + input_shape[-1]
    attention_mask = attn_mask_converter.to_causal_4d(
        input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device
    )

    return attention_mask
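

# Usage sketch (illustrative, not part of the upstream call sites; names like `batch_size`, `seq_length` and
# `inputs_embeds` are placeholders for whatever the calling model provides): a decoder typically builds its additive
# attention bias once per forward pass, e.g.
#
#     from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
#
#     attention_bias = _prepare_4d_causal_attention_mask(
#         attention_mask,              # (batch_size, key_value_length) with 1 = attend, 0 = padding, or None
#         (batch_size, seq_length),    # (batch_size, query_length)
#         inputs_embeds,               # only used for its dtype and device
#         past_key_values_length,
#         sliding_window=None,         # set for windowed-attention models
#     )
#     # `attention_bias` is a (batch_size, 1, query_length, key_value_length) float tensor of 0.0 / finfo.min values
#     # that is added to the raw attention scores, or None when there is nothing to mask (e.g. single-token decoding
#     # with no padding mask).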