
"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, Optional, Union

import tensorflow as tf


try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# The schedules module has moved between Keras releases; use whichever location exists.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default is a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
    """

    def __init__(
        self,
        initial_learning_rate: float,
        decay_schedule_fn: Callable,
        warmup_steps: int,
        power: float = 1.0,
        name: Optional[str] = None,
    ):
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.warmup_steps = warmup_steps
        self.power = power
        self.decay_schedule_fn = decay_schedule_fn
        self.name = name

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as name:
            # Polynomial warmup: while step < warmup_steps, the learning rate is
            # `initial_learning_rate * (step / warmup_steps) ** power`; afterwards the decay
            # schedule takes over, shifted so that it starts at the end of the warmup.
            global_step_float = tf.cast(step, tf.float32)
            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
            warmup_percent_done = global_step_float / warmup_steps_float
            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
            return tf.cond(
                global_step_float < warmup_steps_float,
                lambda: warmup_learning_rate,
                lambda: self.decay_schedule_fn(step - self.warmup_steps),
                name=name,
            )

    def get_config(self):
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_schedule_fn": self.decay_schedule_fn,
            "warmup_steps": self.warmup_steps,
            "power": self.power,
            "name": self.name,
        }
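
# Illustrative sketch (not part of this module): wrapping a decay schedule in `WarmUp`. The step
# counts and learning rate below are made-up example values.
#
#     decay_fn = schedules.PolynomialDecay(
#         initial_learning_rate=3e-5, decay_steps=9_000, end_learning_rate=0.0
#     )
#     warmup_schedule = WarmUp(initial_learning_rate=3e-5, decay_schedule_fn=decay_fn, warmup_steps=1_000)
#     # warmup_schedule(step) rises linearly from 0 to 3e-5 over the first 1,000 steps (power=1.0),
#     # then follows `decay_fn(step - 1_000)` for the remainder of training.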


def create_optimizer(
    init_lr: float,
    num_train_steps: int,
    num_warmup_steps: int,
    min_lr_ratio: float = 0.0,
    adam_beta1: float = 0.9,
    adam_beta2: float = 0.999,
    adam_epsilon: float = 1e-8,
    adam_clipnorm: Optional[float] = None,
    adam_global_clipnorm: Optional[float] = None,
    weight_decay_rate: float = 0.0,
    power: float = 1.0,
    include_in_weight_decay: Optional[list[str]] = None,
):
    """
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
    """
    # Linear (or polynomial, if `power` != 1.0) decay of the learning rate after the warmup phase.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # Both the optimizer and the schedule are returned so the learning rate can be tracked
    # independently of the optimizer.
    return optimizer, lr_schedule
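
# Illustrative sketch (not part of this module): building an optimizer/schedule pair with
# `create_optimizer`. The hyperparameter values are arbitrary example choices; `model` and
# `loss_fn` are assumed to exist.
#
#     optimizer, lr_schedule = create_optimizer(
#         init_lr=3e-5,
#         num_train_steps=10_000,
#         num_warmup_steps=1_000,
#         weight_decay_rate=0.01,  # > 0.0, so an `AdamWeightDecay` instance is returned
#     )
#     model.compile(optimizer=optimizer, loss=loss_fn)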


class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply the AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If an
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value; `decay` is included for backward compatibility to allow time
            inverse decay of the learning rate; `lr` is included for backward compatibility, but it is recommended to
            use `learning_rate` instead.
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[list[str]] = None,
        exclude_from_weight_decay: Optional[list[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super().from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super().apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        # Apply the decoupled weight decay before the regular Adam update.
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True
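
# Illustrative sketch (not part of this module): instantiating `AdamWeightDecay` directly and
# excluding bias and layer-norm parameters from decay, mirroring what `create_optimizer` does.
# The learning rate and decay rate are arbitrary example values.
#
#     optimizer = AdamWeightDecay(
#         learning_rate=1e-4,
#         weight_decay_rate=0.01,
#         exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
#     )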


class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
    """

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )
        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            # Create the step variable and one accumulator variable per gradient on first use.
            _ = self.step
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
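
# Illustrative sketch (not part of this module): accumulating gradients over several micro-batches
# before a single optimizer step. `model`, `loss_fn`, `optimizer`, and `micro_batches` are assumed
# to exist; the accumulation factor of 4 is an arbitrary example value.
#
#     accumulator = GradientAccumulator()
#     for features, labels in micro_batches:  # e.g. 4 micro-batches per optimizer step
#         with tf.GradientTape() as tape:
#             loss = loss_fn(labels, model(features))
#         accumulator(tape.gradient(loss, model.trainable_variables))
#     grads = [g / 4.0 if g is not None else g for g in accumulator.gradients]
#     optimizer.apply_gradients(zip(grads, model.trainable_variables))
#     accumulator.reset()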