"""Functions and classes related to optimization (weight updates)."""

import re
from typing import Callable, List, Optional, Union

import tensorflow as tf


try:
    from tf_keras.optimizers.legacy import Adam
except (ImportError, ModuleNotFoundError):
    from tensorflow.keras.optimizers.legacy import Adam

from .modeling_tf_utils import keras


# The schedule classes moved between Keras releases; resolve the correct module once here.
if hasattr(keras.optimizers.schedules, "learning_rate_schedule"):
    schedules = keras.optimizers.schedules.learning_rate_schedule
else:
    schedules = keras.optimizers.schedules


class WarmUp(schedules.LearningRateSchedule):
    """
    Applies a warmup schedule on a given learning rate decay schedule.

    Args:
        initial_learning_rate (`float`):
            The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
            of the warmup).
        decay_schedule_fn (`Callable`):
            The schedule function to apply after the warmup for the rest of training.
        warmup_steps (`int`):
            The number of steps for the warmup part of training.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for the polynomial warmup (the default of 1.0 gives a linear warmup).
        name (`str`, *optional*):
            Optional name prefix for the returned tensors during the schedule.
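
    Example (an illustrative sketch using this module's `schedules` alias; the rates and step
    counts are placeholder values, not recommendations):

    ```python
    decay_fn = schedules.PolynomialDecay(
        initial_learning_rate=5e-5, decay_steps=9_000, end_learning_rate=0.0
    )
    lr_schedule = WarmUp(initial_learning_rate=5e-5, decay_schedule_fn=decay_fn, warmup_steps=1_000)
    # lr_schedule(step) ramps up linearly for the first 1_000 steps (power=1.0),
    # then follows decay_fn, and can be passed to any Keras optimizer.
    ```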
          ?Ninitial_learning_ratedecay_schedule_fnwarmup_stepspowernamec                    s,   t    || _|| _|| _|| _|| _d S N)super__init__r   r   r   r   r   )selfr   r   r   r   r   	__class__ K/var/www/auris/lib/python3.10/site-packages/transformers/optimization_tf.pyr   8   s   

zWarmUp.__init__c                    s   t  jpd:}t t j}t  jt j}|| } jt j| j	 t j
||k fdd fdd|dW  d    S 1 sEw   Y  d S )Nr	   c                      s    S r   r   r   )warmup_learning_rater   r   <lambda>Q   s    z!WarmUp.__call__.<locals>.<lambda>c                      s      j S r   )r   r   r   )r   stepr   r   r   R   s    r   )tfZ
name_scoper   castZfloat32r   r   mathpowr   Zcond)r   r   r   Zglobal_step_floatZwarmup_steps_floatZwarmup_percent_doner   )r   r   r   r   __call__G   s   
$zWarmUp.__call__c                 C   s   | j | j| j| j| jdS )Nr   r   r   r   r   r!   r   r   r   r   
get_configV   s   zWarmUp.get_config)r
   N)__name__
__module____qualname____doc__floatr   intr   strr   r    r#   __classcell__r   r   r   r   r	   &   s"    r	           ?+?:0yE>r
   init_lrnum_train_stepsnum_warmup_stepsmin_lr_ratio
adam_beta1
adam_beta2adam_epsilonadam_clipnormadam_global_clipnormweight_decay_rater   include_in_weight_decayc                 C   sz   t j| || | | |
d}|rt| ||d}|	dkr-t||	|||||g d|d	}||fS tjj||||||d}||fS )a  
    Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.

    Args:
        init_lr (`float`):
            The desired learning rate at the end of the warmup phase.
        num_train_steps (`int`):
            The total number of training steps.
        num_warmup_steps (`int`):
            The number of warmup steps.
        min_lr_ratio (`float`, *optional*, defaults to 0):
            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
        adam_beta1 (`float`, *optional*, defaults to 0.9):
            The beta1 to use in Adam.
        adam_beta2 (`float`, *optional*, defaults to 0.999):
            The beta2 to use in Adam.
        adam_epsilon (`float`, *optional*, defaults to 1e-8):
            The epsilon to use in Adam.
        adam_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip the gradient norm for each weight tensor to this value.
        adam_global_clipnorm (`float`, *optional*, defaults to `None`):
            If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all
            weight tensors, as if they were concatenated into a single vector.
        weight_decay_rate (`float`, *optional*, defaults to 0):
            The weight decay to use.
        power (`float`, *optional*, defaults to 1.0):
            The power to use for PolynomialDecay.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters except bias and layer norm parameters.
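
    Example (an illustrative sketch; `model` is assumed to be an existing Keras model, and the
    step counts and rates below are placeholder values):

    ```python
    optimizer, lr_schedule = create_optimizer(
        init_lr=5e-5,
        num_train_steps=10_000,
        num_warmup_steps=500,
        weight_decay_rate=0.01,
    )
    model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy")
    ```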
    """
    # Polynomial (by default linear) decay of the learning rate after the warmup phase.
    lr_schedule = schedules.PolynomialDecay(
        initial_learning_rate=init_lr,
        decay_steps=num_train_steps - num_warmup_steps,
        end_learning_rate=init_lr * min_lr_ratio,
        power=power,
    )
    if num_warmup_steps:
        lr_schedule = WarmUp(
            initial_learning_rate=init_lr,
            decay_schedule_fn=lr_schedule,
            warmup_steps=num_warmup_steps,
        )
    if weight_decay_rate > 0.0:
        optimizer = AdamWeightDecay(
            learning_rate=lr_schedule,
            weight_decay_rate=weight_decay_rate,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
            include_in_weight_decay=include_in_weight_decay,
        )
    else:
        optimizer = keras.optimizers.Adam(
            learning_rate=lr_schedule,
            beta_1=adam_beta1,
            beta_2=adam_beta2,
            epsilon=adam_epsilon,
            clipnorm=adam_clipnorm,
            global_clipnorm=adam_global_clipnorm,
        )
    # Return both the optimizer and the schedule so callers can track the learning rate
    # independently of the optimizer.
    return optimizer, lr_schedule


class AdamWeightDecay(Adam):
    """
    Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
    loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay
    Regularization](https://arxiv.org/abs/1711.05101).

    Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
    to adding the square of the weights to the loss with plain (non-momentum) SGD.

    Args:
        learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001):
            The learning rate to use or a schedule.
        beta_1 (`float`, *optional*, defaults to 0.9):
            The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
        beta_2 (`float`, *optional*, defaults to 0.999):
            The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
        epsilon (`float`, *optional*, defaults to 1e-07):
            The epsilon parameter in Adam, which is a small constant for numerical stability.
        amsgrad (`bool`, *optional*, defaults to `False`):
            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
            Beyond](https://arxiv.org/abs/1904.09237).
        weight_decay_rate (`float`, *optional*, defaults to 0.0):
            The weight decay to apply.
        include_in_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
        exclude_from_weight_decay (`List[str]`, *optional*):
            List of the parameter names (or re patterns) to exclude from applying weight decay to. If an
            `include_in_weight_decay` is passed, the names in it will supersede this list.
        name (`str`, *optional*, defaults to `"AdamWeightDecay"`):
            Optional name for the operations created when applying gradients.
        kwargs (`Dict[str, Any]`, *optional*):
            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients by
            norm; `clipvalue` clips gradients by value. `decay` is included for backward compatibility to allow time
            inverse decay of the learning rate. `lr` is included for backward compatibility; it is recommended to use
            `learning_rate` instead.
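
    Example (an illustrative sketch; the hyperparameter values are placeholders):

    ```python
    optimizer = AdamWeightDecay(
        learning_rate=3e-5,
        weight_decay_rate=0.01,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
    )
    ```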
    """

    def __init__(
        self,
        learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001,
        beta_1: float = 0.9,
        beta_2: float = 0.999,
        epsilon: float = 1e-7,
        amsgrad: bool = False,
        weight_decay_rate: float = 0.0,
        include_in_weight_decay: Optional[List[str]] = None,
        exclude_from_weight_decay: Optional[List[str]] = None,
        name: str = "AdamWeightDecay",
        **kwargs,
    ):
        super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs)
        self.weight_decay_rate = weight_decay_rate
        self._include_in_weight_decay = include_in_weight_decay
        self._exclude_from_weight_decay = exclude_from_weight_decay

    @classmethod
    def from_config(cls, config):
        """Creates an optimizer from its config with WarmUp custom object."""
        custom_objects = {"WarmUp": WarmUp}
        return super().from_config(config, custom_objects=custom_objects)

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant(
            self.weight_decay_rate, name="adam_weight_decay_rate"
        )

    def _decay_weights_op(self, var, learning_rate, apply_state):
        do_decay = self._do_use_weight_decay(var.name)
        if do_decay:
            return var.assign_sub(
                learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"],
                use_locking=self._use_locking,
            )
        return tf.no_op()

    def apply_gradients(self, grads_and_vars, name=None, **kwargs):
        grads, tvars = list(zip(*grads_and_vars))
        return super().apply_gradients(zip(grads, tvars), name=name, **kwargs)

    def _get_lr(self, var_device, var_dtype, apply_state):
        """Retrieves the learning rate with the given state."""
        if apply_state is None:
            return self._decayed_lr_t[var_dtype], {}

        apply_state = apply_state or {}
        coefficients = apply_state.get((var_device, var_dtype))
        if coefficients is None:
            coefficients = self._fallback_apply_state(var_device, var_dtype)
            apply_state[(var_device, var_dtype)] = coefficients

        return coefficients["lr_t"], {"apply_state": apply_state}

    def _resource_apply_dense(self, grad, var, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_dense(grad, var, **kwargs)

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state)
        decay = self._decay_weights_op(var, lr_t, apply_state)
        with tf.control_dependencies([decay]):
            return super()._resource_apply_sparse(grad, var, indices, **kwargs)

    def get_config(self):
        config = super().get_config()
        config.update({"weight_decay_rate": self.weight_decay_rate})
        return config

    def _do_use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if self.weight_decay_rate == 0:
            return False

        if self._include_in_weight_decay:
            for r in self._include_in_weight_decay:
                if re.search(r, param_name) is not None:
                    return True

        if self._exclude_from_weight_decay:
            for r in self._exclude_from_weight_decay:
                if re.search(r, param_name) is not None:
                    return False
        return True


class GradientAccumulator:
    """
    Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
    replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
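
    Example (an illustrative sketch of the accumulate / scale / apply / reset cycle; `model`,
    `loss_fn`, `optimizer`, `batches`, and `accumulation_steps` are assumed to be defined by the
    caller, and under a distribution strategy this body would run in a replica context):

    ```python
    accumulator = GradientAccumulator()
    for i, (features, labels) in enumerate(batches):
        with tf.GradientTape() as tape:
            loss = loss_fn(labels, model(features, training=True))
        accumulator(tape.gradient(loss, model.trainable_variables))
        if (i + 1) % accumulation_steps == 0:
            # Scale by the number of accumulated steps before applying, then reset.
            scale = tf.cast(accumulator.step, tf.float32)
            grads = [g / scale if g is not None else g for g in accumulator.gradients]
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
            accumulator.reset()
    ```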
    """

    def __init__(self):
        """Initializes the accumulator."""
        self._gradients = []
        self._accum_steps = None

    @property
    def step(self):
        """Number of accumulated steps."""
        if self._accum_steps is None:
            self._accum_steps = tf.Variable(
                tf.constant(0, dtype=tf.int64),
                trainable=False,
                synchronization=tf.VariableSynchronization.ON_READ,
                aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
            )

        return self._accum_steps.value()

    @property
    def gradients(self):
        """The accumulated gradients on the current replica."""
        if not self._gradients:
            raise ValueError("The accumulator should be called first to initialize the gradients")
        return [gradient.value() if gradient is not None else gradient for gradient in self._gradients]

    def __call__(self, gradients):
        """Accumulates `gradients` on the current replica."""
        if not self._gradients:
            _ = self.step  # Create the step variable.
            self._gradients.extend(
                [
                    tf.Variable(
                        tf.zeros_like(gradient),
                        trainable=False,
                        synchronization=tf.VariableSynchronization.ON_READ,
                        aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA,
                    )
                    if gradient is not None
                    else gradient
                    for gradient in gradients
                ]
            )
        if len(gradients) != len(self._gradients):
            raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}")

        for accum_gradient, gradient in zip(self._gradients, gradients):
            if accum_gradient is not None and gradient is not None:
                accum_gradient.assign_add(gradient)

        self._accum_steps.assign_add(1)

    def reset(self):
        """Resets the accumulated gradients on the current replica."""
        if not self._gradients:
            return
        self._accum_steps.assign(0)
        for gradient in self._gradients:
            if gradient is not None:
                gradient.assign(tf.zeros_like(gradient))
   N)r'   ri   typingr   r   r   Z
tensorflowr   Ztf_keras.optimizers.legacyr   ImportErrorModuleNotFoundErrorZ"tensorflow.keras.optimizers.legacyZmodeling_tf_utilsr   hasattrrD   rB   r   rm   r	   r(   r)   r[   r*   rE   rC   rp   r   r   r   r   <module>   sf   >	


T 