
import math
from collections import OrderedDict

import torch
from torch import Tensor, nn

from .utils import logging


logger = logging.get_logger(__name__)


class PytorchGELUTanh(nn.Module):
    """
    A fast C implementation of the tanh approximation of the GeLU activation function. See
    https://arxiv.org/abs/1606.08415.

    This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical
    match due to rounding errors.
    """

    def forward(self, input: Tensor) -> Tensor:
        return nn.functional.gelu(input, approximate="tanh")


class NewGELUActivation(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0))))


class GELUActivation(nn.Module):
    """
    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional
    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
    """

    def __init__(self, use_gelu_python: bool = False):
        super().__init__()
        if use_gelu_python:
            self.act = self._gelu_python
        else:
            self.act = nn.functional.gelu

    def _gelu_python(self, input: Tensor) -> Tensor:
        return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0)))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


class FastGELUActivation(nn.Module):
    """
    Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input)))


class QuickGELUActivation(nn.Module):
    """
    Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs
    """

    def forward(self, input: Tensor) -> Tensor:
        return input * torch.sigmoid(1.702 * input)


class ClippedGELUActivation(nn.Module):
    """
    Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as
    it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to
    https://arxiv.org/abs/2004.09602.

    Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when
    initially created.

    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
    torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415
    """

    def __init__(self, min: float, max: float):
        if min > max:
            raise ValueError(f"min should be < max (got min: {min}, max: {max})")

        super().__init__()
        self.min = min
        self.max = max

    def forward(self, x: Tensor) -> Tensor:
        return torch.clip(gelu(x), self.min, self.max)


class AccurateGELUActivation(nn.Module):
    """
    Applies GELU approximation that is faster than default and more accurate than QuickGELU. See:
    https://github.com/hendrycks/GELUs

    Implemented along with MEGA (Moving Average Equipped Gated Attention)
    """

    def __init__(self):
        super().__init__()
        self.precomputed_constant = math.sqrt(2 / math.pi)

    def forward(self, input: Tensor) -> Tensor:
        return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3))))


class MishActivation(nn.Module):
    """
    See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also
    visit the official repository for the paper: https://github.com/digantamisra98/Mish
    """

    def __init__(self):
        super().__init__()
        self.act = nn.functional.mish

    def _mish_python(self, input: Tensor) -> Tensor:
        return input * torch.tanh(nn.functional.softplus(input))

    def forward(self, input: Tensor) -> Tensor:
        return self.act(input)


class LinearActivation(nn.Module):
    """
    Applies the linear activation function, i.e. forwarding input directly to output.
    """

    def forward(self, input: Tensor) -> Tensor:
        return input


class LaplaceActivation(nn.Module):
    """
    Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See
    https://arxiv.org/abs/2209.10655

    Inspired by squared relu, but with bounded range and gradient for better stability
    """

    def forward(self, input, mu=0.707107, sigma=0.282095):
        input = (input - mu).div(sigma * math.sqrt(2.0))
        return 0.5 * (1.0 + torch.erf(input))


class ReLUSquaredActivation(nn.Module):
    """
    Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2
    """

    def forward(self, input):
        relu_applied = nn.functional.relu(input)
        squared = torch.square(relu_applied)
        return squared


class ClassInstantier(OrderedDict):
    def __getitem__(self, key):
        content = super().__getitem__(key)
        cls, kwargs = content if isinstance(content, tuple) else (content, {})
        return cls(**kwargs)


ACT2CLS = {
    "gelu": GELUActivation,
    "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
    "gelu_fast": FastGELUActivation,
    "gelu_new": NewGELUActivation,
    "gelu_python": (GELUActivation, {"use_gelu_python": True}),
    "gelu_pytorch_tanh": PytorchGELUTanh,
    "gelu_accurate": AccurateGELUActivation,
    "laplace": LaplaceActivation,
    "leaky_relu": nn.LeakyReLU,
    "linear": LinearActivation,
    "mish": MishActivation,
    "quick_gelu": QuickGELUActivation,
    "relu": nn.ReLU,
    "relu2": ReLUSquaredActivation,
    "relu6": nn.ReLU6,
    "sigmoid": nn.Sigmoid,
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
}
ACT2FN = ClassInstantier(ACT2CLS)


def get_activation(activation_string):
    if activation_string in ACT2FN:
        return ACT2FN[activation_string]
    else:
        raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}")


gelu_python = get_activation("gelu_python")
gelu_new = get_activation("gelu_new")
gelu = get_activation("gelu")
gelu_fast = get_activation("gelu_fast")
quick_gelu = get_activation("quick_gelu")
silu = get_activation("silu")
mish = get_activation("mish")
linear_act = get_activation("linear")
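

# Usage sketch (illustrative, not part of the original module): how downstream code
# typically resolves an activation by name through ``get_activation`` / ``ACT2FN``.
# The "gelu_new" key and the tensor shape below are arbitrary choices for illustration;
# the import path assumes the module is installed as ``transformers.activations``.
#
#     import torch
#     from transformers.activations import ACT2FN, get_activation
#
#     act = get_activation("gelu_new")   # instantiated nn.Module for the "gelu_new" entry
#     same = ACT2FN["gelu_new"]          # equivalent lookup; ClassInstantier builds a fresh instance
#     x = torch.randn(2, 4)              # arbitrary example tensor
#     y = act(x)                         # applied elementwise; y.shape == x.shape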