
    eTh_:                    :   S SK Jr  S SKrS SKJrJr  S SKJr  S SKrS SK	J
r
Jr  S SKJr  SSKJrJrJrJr  \R$                  /r\R(                  " \5      r\" S	S
S9r\" SS
S9r\" SS
S9r\" SS
S9r\" SS
S9r\" SS
S9r\" SS
S9r\" SS
S9r\R>                  RA                  5       r!\" S5      (       a  \!(       a   S r"S!S"S jjr# " S S\RH                  5      r%S#S$S jjr&S%S&S jjr'        S'S jr(          S(S jr)SS.S)S jjr*S*S jr+S+S jr,S  r-g),    )annotationsN)	lru_cachewraps)Callable)storage_ptrstorage_size)nn   )is_torch_greater_or_equalis_torch_xla_availableis_torchdynamo_compilingloggingz2.6T)
accept_devz2.4z2.3z2.2z2.1z2.0z1.13z1.12z2.5c                J    SSK Jn  U" XU R                  UR                  5      $ )z
A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according
to the torch version detected.
r   )_softmax_backward_data)torchr   dimdtype)parentgrad_outputoutputr   selfr   s         R/var/www/auris/envauris/lib/python3.13/site-packages/transformers/pytorch_utils.pysoftmax_backward_datar   1   s     -!+vzz4::NN    c                   UR                  U R                  R                  5      nU R                  R                  X!5      R	                  5       R                  5       nU R                  bZ  US:X  a)  U R                  R	                  5       R                  5       nO+U R                  U   R	                  5       R                  5       n[        U R                  R                  5       5      n[        U5      XR'   [        R                  " US   US   U R                  SLS9R                  U R                  R                  5      nSUR                  l        UR                  R                  UR                  5       5        SUR                  l        U R                  bK  SUR                  l        UR                  R                  WR                  5       5        SUR                  l        U$ )a|  
Prune a linear layer to keep only entries in index.

Used to remove heads.

Args:
    layer (`torch.nn.Linear`): The layer to prune.
    index (`torch.LongTensor`): The indices to keep in the layer.
    dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices.

Returns:
    `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.
Nr
   r   )biasFT)toweightdeviceindex_selectdetachcloner   listsizelenr	   Linearrequires_gradcopy_
contiguouslayerindexr   Wbnew_size	new_layers          r   prune_linear_layerr2   <   sa    HHU\\(()E!!#-446<<>Azz!8

!!#))+A

5!((*002AELL%%'(HJHM		(1+x{49OPSSTYT`T`TgTghI%*I"1<<>*%)I"zz',	$Q\\^,'+	$r   c                  <   ^  \ rS rSrSrU 4S jrSS jrS rSrU =r	$ )Conv1D^   a  
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).

Basically works like a linear layer but the weights are transposed.

Args:
    nf (`int`): The number of output features.
    nx (`int`): The number of input features.
c                F  > [         TU ]  5         Xl        X l        [        R
                  " [        R                  " X!5      5      U l        [        R
                  " [        R                  " U5      5      U l
        [        R                  R                  U R                  SS9  g )Ng{Gz?)std)super__init__nfnxr	   	Parameterr   emptyr   zerosr   initnormal_)r   r:   r;   	__class__s      r   r9   Conv1D.__init__i   sa    ll5;;r#67LLR1	
.r   c                :    SR                   " S0 U R                  D6$ )NzConv1D(nf={nf}, nx={nx}) )format__dict__)r   s    r   __repr__Conv1D.__repr__q   s    )00A4==AAr   c           	         UR                  5       S S U R                  4-   n[        R                  " U R                  UR                  SUR                  S5      5      U R                  5      nUR                  U5      nU$ )N)r%   r:   r   addmmr   viewr   )r   xsize_outs      r   forwardConv1D.forwardt   s^    668CR=DGG:-KK		166"affRj#94;;GFF8r   )r   r:   r;   r   )returnstr)
__name__
__module____qualname____firstlineno____doc__r9   rG   rO   __static_attributes____classcell__)rA   s   @r   r4   r4   ^   s    /B r   r4   c                h   UR                  U R                  R                  5      nU R                  R                  X!5      R	                  5       R                  5       nUS:X  a)  U R                  R	                  5       R                  5       nO+U R                  U   R	                  5       R                  5       n[        U R                  R                  5       5      n[        U5      XR'   [        US   US   5      R                  U R                  R                  5      nSUR                  l        UR                  R                  UR                  5       5        SUR                  l        SUR                  l        UR                  R                  UR                  5       5        SUR                  l        U$ )a  
Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights
are transposed.

Used to remove heads.

Args:
    layer ([`~pytorch_utils.Conv1D`]): The layer to prune.
    index (`torch.LongTensor`): The indices to keep in the layer.
    dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices.

Returns:
    [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`.
r   r
   FT)r   r   r    r!   r"   r#   r   r$   r%   r&   r4   r(   r)   r*   r+   s          r   prune_conv1d_layerr[   {   s;    HHU\\(()E!!#-446<<>A
axJJ%%'JJu$$&,,.ELL%%'(HJHMx{HQK033ELL4G4GHI%*I"1<<>*%)I"#(INN NN(#'INN r   c                    [        U [        R                  5      (       a  [        XUc  SS9$ US9$ [        U [        5      (       a  [        XUc  SS9$ US9$ [        SU R                   35      e)a  
Prune a Conv1D or linear layer to keep only entries in index.

Used to remove heads.

Args:
    layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
    index (`torch.LongTensor`): The indices to keep in the layer.
    dim (`int`, *optional*): The dimension on which to keep the indices.

Returns:
    `torch.nn.Linear` or [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`.
r   r   r
   zCan't prune layer of class )
isinstancer	   r'   r2   r4   r[   
ValueErrorrA   )r,   r-   r   s      r   prune_layerr`      sl     %##!%ANN#NN	E6	"	"!%ANN#NN6u6GHIIr   c                  ^ ^^	 [        U5      S:  d
   U S35       e[        [        R                  " T 5      R                  5      nU[        U5      :w  a  [	        SU S[        U5       S35      eUS:  a  US   R
                  T   nU H4  nUR
                  T   U:w  d  M  [	        SU SUR
                  T    35      e   US   R
                  T   U-  S:w  a!  [	        SUS   R
                  T    S	U 35      eUS   R
                  T   U-  m	[        UU	4S
 jU 5       5      n[        U 4S j[        U6  5       5      n[        R                  " UTS9$ T " U6 $ )a  
This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension
`chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory.

If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly
applying `forward_fn` to `input_tensors`.

Args:
    forward_fn (`Callable[..., torch.Tensor]`):
        The forward function of the model.
    chunk_size (`int`):
        The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`.
    chunk_dim (`int`):
        The dimension over which the `input_tensors` should be chunked.
    input_tensors (`Tuple[torch.Tensor]`):
        The input tensors of `forward_fn` which will be chunked

Returns:
    `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`.


Examples:

```python
# rename the usual forward() fn to forward_chunk()
def forward_chunk(self, hidden_states):
    hidden_states = self.decoder(hidden_states)
    return hidden_states


# implement a chunked forward function
def forward(self, hidden_states):
    return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
```r   z" has to be a tuple/list of tensorszforward_chunk_fn expects z arguments, but only z input tensors are givenz/All input tenors have to be of the same shape: z, found shape zThe dimension to be chunked z( has to be a multiple of the chunk size c              3  B   >#    U  H  oR                  TTS 9v   M     g7f)r]   N)chunk).0input_tensor	chunk_dim
num_chunkss     r   	<genexpr>,apply_chunking_to_forward.<locals>.<genexpr>   s"     $ugtWc%7%7
	%7%Rgts   c              3  .   >#    U  H
  nT" U6 v   M     g 7fNrD   )rd   input_tensors_chunk
forward_fns     r   rh   ri      s     uZtCVj*=>Zts   r]   )
r&   inspect	signature
parametersr_   shapetuplezipr   cat)
rm   
chunk_sizerf   input_tensorsnum_args_in_forward_chunk_fntensor_shapere   input_tensors_chunksoutput_chunksrg   s
   ` `      @r   apply_chunking_to_forwardr{      s   R }!Wm_4V#WW! $'w'8'8'D'O'O#P #s='99'(D'EEZ[^_l[mZn o   
 	

 A~$Q'--i8)L!!),< El^ T##/#5#5i#@"AC  * !!),z9Q>.}Q/?/E/Ei/P.Q R"|% 
 #1%++I6*D
  %$ugt$uuuZ]_sZtuuyyI66}%%r   c                Z  ^ [         R                  " X5      n[        U 5      U-
  n U  H   mT[        U4S jU 5       5      -
  mSUT'   M"     UR	                  S5      R                  5       R                  S5      n[         R                  " [        U5      5      U   R                  5       nX4$ )a  
Finds the heads and their indices taking `already_pruned_heads` into account.

Args:
    heads (`List[int]`): List of the indices of heads to prune.
    n_heads (`int`): The number of heads in the model.
    head_size (`int`): The size of each head.
    already_pruned_heads (`Set[int]`): A set of already pruned heads.

Returns:
    `Tuple[Set[int], torch.LongTensor]`: A tuple with the indices of heads to prune taking `already_pruned_heads`
    into account and the indices of rows/columns to keep in the layer weight.
c              3  6   >#    U  H  oT:  a  S OSv   M     g7f)r
   r   NrD   )rd   hheads     r   rh   3find_pruneable_heads_and_indices.<locals>.<genexpr>  s     M8L14x!Q.8Ls   r   rJ   r
   )
r   onessetsumrL   r*   eqaranger&   long)headsn_heads	head_sizealready_pruned_headsmaskr-   r   s         @r    find_pruneable_heads_and_indicesr      s      ::g)DJ--EcM8LMMMT
  99R=##%((+D#ll3t95d;@@BE<r   )indexingc                .    [         R                  " USU 06$ )z
Wrapper around torch.meshgrid to avoid warning messages about the introduced `indexing` argument.

Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html
r   )r   meshgrid)r   tensorss     r   r   r     s     >>76X66r   c                   [         (       a[  SSKJn  [        X5      (       aE  U R	                  5       nU R
                  UR                  5       R                  5       U R                  4$ U R
                  R                  S:X  a/  [        5       (       a   SSKnUR                  R                  U 5      nO[        U 5      nU R
                  U[        U 5      4$ )a  
Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For
example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is
guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with
non-overlapping lifetimes may have the same id.
r   )DTensorxlaN)"is_torch_greater_or_equal_than_2_0torch.distributed.tensorr   r^   to_localr    storagedata_ptrnbytestyper   	torch_xla_XLAC_xla_get_tensor_idr   r   )tensorr   local_tensorr   	unique_ids        r   id_tensor_storager   $  s     *)4f&&!??,L==,"6"6"8"A"A"CV]]RR}}U"'='?'?
 	OO66v>	'	==)\&%999r   c                   U R                   R                  S:X  a  [        (       d  [        R                  " U5      nUR
                  S:X  a  UR                  S5      nU R                  UR                  S   S5      R                  UR                  S5      5      R                  SS9R                  5       R                  5       $ [        R                  " X5      $ )a  
Same as `torch.isin` without flags, but MPS-friendly. We can remove this function when we stop supporting
torch <= 2.3. See https://github.com/pytorch/pytorch/issues/77764#issuecomment-2067838075

Args:
    elements (`torch.Tensor`): Input elements
    test_elements (`torch.Tensor` or `int`): The elements to check against.

Returns:
    `torch.Tensor`: A boolean tensor of the same shape as `elements` that is True for `elements` in `test_elements`
    and False otherwise
mpsr   r
   r]   )r    r   "is_torch_greater_or_equal_than_2_4r   r   ndim	unsqueezetilerq   r   r   boolsqueezeisin)elementstest_elementss     r   isin_mps_friendlyr   @  s     u$-O-O]3")33A6M}}]003Q7::=;R;RST;UVZZ_`Zaffhpprr zz(22r   c                    ^ ^ U U4S jnU$ )z
LRU cache decorator from standard functools library, but with a workaround to disable
caching when torchdynamo is compiling. Expected to work with class methods.
c                6   >^  [        T 5      U UU4S j5       nU$ )Nc                8  > [        5       (       d  [        U STR                   35      (       d=  U R                  STR                   3[	        T0 TD6" TR                  U 5      5      5        U R                  STR                   35      " U0 UD6$ T" U /UQ70 UD6$ )N_cached_)r   hasattrrS   __setattr__r   __get____getattribute__)r   argskwargsfunclru_args
lru_kwargss      r   wrapperGcompile_compatible_method_lru_cache.<locals>.decorator.<locals>.wrapper_  s    +-- tx%?@@$$"4==/2Ix4V:4VW[WcWcdhWi4j ,,x-GH$YRXYY D242622r   )r   )r   r   r   r   s   ` r   	decorator6compile_compatible_method_lru_cache.<locals>.decorator^  s     	t	3 
	3 r   rD   )r   r   r   s   `` r   #compile_compatible_method_lru_cacher   X  s    " r   )r   )r,   	nn.Linearr-   torch.LongTensorr   intrQ   r   )r
   )r,   r4   r-   r   r   r   rQ   r4   rk   )r,   nn.Linear | Conv1Dr-   r   r   z
int | NonerQ   r   )rm   zCallable[..., torch.Tensor]ru   r   rf   r   rQ   torch.Tensor)
r   z	list[int]r   r   r   r   r   zset[int]rQ   z!tuple[set[int], torch.LongTensor])r   z!torch.Tensor | list[torch.Tensor]r   z
str | NonerQ   ztuple[torch.Tensor, ...])r   r   rQ   ztuple[torch.device, int, int])r   r   r   ztorch.Tensor | intrQ   r   ).
__future__r   rn   	functoolsr   r   typingr   r   safetensors.torchr   r   r	   utilsr   r   r   r   	LayerNormALL_LAYERNORM_LAYERS
get_loggerrS   logger"is_torch_greater_or_equal_than_2_6r   "is_torch_greater_or_equal_than_2_3"is_torch_greater_or_equal_than_2_2"is_torch_greater_or_equal_than_2_1r   #is_torch_greater_or_equal_than_1_13#is_torch_greater_or_equal_than_1_12distributedis_available_torch_distributed_availabler   r2   Moduler4   r[   r`   r{   r   r   r   r   r   rD   r   r   <module>r      sy   #  &   7  g g ~ 			H	%%>uQU%V "%>uQU%V "%>uQU%V "%>uQU%V " &?uQU%V "%>uQU%V "&?SW&X #&?SW&X #  %00==? U##(DODRYY :BJ,K&+K&K& K&
 K&\"/2JR&6 RV 7:830r   