
    JTh_              "          S r SSKJrJrJr  SSKrSSKJr  SSKJrJ	r	J
r
JrJrJrJrJrJrJrJrJrJrJr  SS/r " S	 S\5      rS
S\ S\ S\ S\ S\
 S3-   \l         S\\   S\\   S\\   S\\   S\\   S\S\S\S\S\S\S\S\S\S\4S jrS\\   S\\   S\\   S\\   S\\   S\S\S\S\S\S\S\S\S\S\4S jr\" \S 9      S#S\\   S\\   S\\   S\\   S\\   S\S!\\   S\S\S\S\S\S\S\S\S\4 S" jj5       rg)$z'Implementation for the RAdam algorithm.    )castOptionalUnionN)Tensor   )_capturable_doc_default_to_fused_or_foreach_differentiable_doc_disable_dynamo_if_unsupported_foreach_doc!_get_capturable_supported_devices_get_scalar_dtype
_get_value_maximize_doc_params_doc_use_grad_for_differentiable_view_as_real	OptimizerParamsTRAdamradamc                      ^  \ rS rSr     SSSSSS.S\S\\\4   S\\\4   S\S	\S
\	S\
\	   S\	S\	S\	4U 4S jjjjrU 4S jrS r\SS j5       rSrU =r$ )r      FN)foreachmaximize
capturabledifferentiableparamslrbetasepsweight_decaydecoupled_weight_decayr   r   r   r   c                  > [        U[        5      (       a  UR                  5       S:w  a  [        S5      eSU::  d  [        SU 35      eSU::  d  [        SU 35      eSUS   s=::  a  S:  d  O  [        SUS    35      eSUS   s=::  a  S:  d  O  [        S	US    35      eSU::  d  [        S
U 35      e[	        UUUUUUU	UU
S9	n[
        TU ]  X5        g )Nr   zTensor lr must be 1-element        zInvalid learning rate: zInvalid epsilon value: r         ?z#Invalid beta parameter at index 0: z#Invalid beta parameter at index 1: zInvalid weight_decay value: )	r   r    r!   r"   r   r   r   r#   r   )
isinstancer   numel
ValueErrordictsuper__init__)selfr   r   r    r!   r"   r#   r   r   r   r   defaults	__class__s               I/var/www/auris/envauris/lib/python3.13/site-packages/torch/optim/radam.pyr,   RAdam.__init__   s     b&!!bhhjAo:;;by6rd;<<cz6se<==eAh$$B58*MNNeAh$$B58*MNNl";L>JKK%!#9)

 	*    c                 t  > [         TU ]  U5        U R                   GH  nUR                  SS 5        UR                  SS5        UR                  SS5        UR                  SS5        UR                  SS5        US    H  nU R                  R                  U/ 5      n[        U5      S:w  d  M0  [        R                  " US	   5      (       a  MP  [        US	   5      nUS   (       a(  [        R                  " U[        5       UR                  S
9O[        R                  " U[        5       S9US	'   M     GM     g )Nr   r   Fr   r#   r   r   r   stepdtypedevicer6   )r+   __setstate__param_groups
setdefaultstategetlentorch	is_tensorfloattensorr   r7   )r-   r<   grouppp_statestep_valr/   s         r0   r9   RAdam.__setstate__F   s   U#&&EY-Z/-u55u=\518_**..B/w<1$U__WV_-M-M$WV_5H
 !. $,=,? #\\(:K:MN FO	 % 'r2   c                 
   SnUS    GHv  nUR                   c  M  U[        R                  " U5      -  nUR                  U5        UR                   R                  (       a  [        S5      eUR                  UR                   5        U R                  U   n	[        U	5      S:X  a  US   (       a(  [        R                  " S[        5       UR                  S9O[        R                  " S[        5       S	9U	S
'   [        R                  " U[        R                  S9U	S'   [        R                  " U[        R                  S9U	S'   UR                  U	S   5        UR                  U	S   5        UR                  U	S
   5        GMy     U$ )NFr   z'RAdam does not support sparse gradientsr   r    r5   r%   r8   r4   )memory_formatexp_avg
exp_avg_sq)gradr?   
is_complexappend	is_sparseRuntimeErrorr<   r>   zerosr   r7   rB   
zeros_likepreserve_format)
r-   rC   params_with_gradgradsexp_avgsexp_avg_sqsstate_stepshas_complexrD   r<   s
             r0   _init_groupRAdam._init_groupZ   sH    xAvv!u//22 ''*66##&'PQQQVV$

1u:? !. B.?.A!((S"\\#5F5HI &M (-'7'7)>)>(E)$ +0*:*:)>)>+E,' i 01""5#67""5=17 !: r2   c                    U R                  5         SnUb%  [        R                  " 5          U" 5       nSSS5        U R                   Hr  n/ n/ n/ n/ n/ n[	        [
        [        [        4   US   5      u  pU R                  X4XVXx5      n[        UUUUUU	U
US   US   US   US   US   US   US	   US
   US9  Mt     U$ ! , (       d  f       N= f)zPerform a single optimization step.

Args:
    closure (Callable, optional): A closure that reevaluates the model
        and returns the loss.
Nr    r   r"   r!   r   r   r   r   r#   )beta1beta2r   r"   r!   r   r   r   r   r#   rZ   )	 _cuda_graph_capture_health_checkr?   enable_gradr:   r   tuplerA   r[   r   )r-   closurelossrC   rU   rV   rW   rX   rY   r^   r_   rZ   s               r0   r4   
RAdam.step}   s    	--/""$y % &&E-/"$E%'H(*K(*KeUl 3U7^DLE**+K  ;">2%Lz*i( .$%56',-E'F'! '> E %$s   B??
CrI   )gMbP?)g?g+?g:0yE>r   FN)__name__
__module____qualname____firstlineno__r   r   rA   r   rb   boolr   r,   r9   r[   r   r4   __static_attributes____classcell__)r/   s   @r0   r   r      s     $(%1',&+ #' $&+&+ %- &+ UE\"	&+
 &+ &+ !%&+ $&+ &+ &+ &+ &+P(!F "- "-r2   a  Implements RAdam algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \beta_1, \beta_2
                \text{ (betas)}, \: \theta_0 \text{ (params)}, \:f(\theta) \text{ (objective)}, \:
                \lambda \text{ (weightdecay)}, \:\textit{maximize}                               \\
            &\hspace{13mm} \epsilon \text{ (epsilon)}, \textit{decoupled\_weight\_decay}         \\
            &\textbf{initialize} :  m_0 \leftarrow 0 \text{ ( first moment)},
                v_0 \leftarrow 0 \text{ ( second moment)},                                       \\
            &\hspace{18mm} \rho_{\infty} \leftarrow 2/(1-\beta_2) -1                      \\[-1.ex]
            &\rule{110mm}{0.4pt}  \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{6mm}\textbf{if} \: \textit{maximize}:                                       \\
            &\hspace{12mm}g_t           \leftarrow   -\nabla_{\theta} f_t (\theta_{t-1})         \\
            &\hspace{6mm}\textbf{else}                                                           \\
            &\hspace{12mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})          \\
            &\hspace{6mm} \theta_t \leftarrow \theta_{t-1}                                       \\
            &\hspace{6mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{12mm}\textbf{if} \: \textit{decoupled\_weight\_decay}                       \\
            &\hspace{18mm} \theta_t \leftarrow \theta_{t} - \gamma \lambda \theta_{t}            \\
            &\hspace{12mm}\textbf{else}                                                          \\
            &\hspace{18mm} g_t \leftarrow g_t + \lambda \theta_{t}                               \\
            &\hspace{6mm}m_t           \leftarrow   \beta_1 m_{t-1} + (1 - \beta_1) g_t          \\
            &\hspace{6mm}v_t           \leftarrow   \beta_2 v_{t-1} + (1-\beta_2) g^2_t          \\
            &\hspace{6mm}\widehat{m_t} \leftarrow   m_t/\big(1-\beta_1^t \big)                   \\
            &\hspace{6mm}\rho_t \leftarrow \rho_{\infty} -
                2 t \beta^t_2 /\big(1-\beta_2^t \big)                                    \\[0.1.ex]
            &\hspace{6mm}\textbf{if} \: \rho_t > 5                                               \\
            &\hspace{12mm} l_t \leftarrow \frac{\sqrt{ (1-\beta^t_2) }}{ \sqrt{v_t} +\epsilon  } \\
            &\hspace{12mm} r_t \leftarrow
      \sqrt{\frac{(\rho_t-4)(\rho_t-2)\rho_{\infty}}{(\rho_{\infty}-4)(\rho_{\infty}-2) \rho_t}} \\
            &\hspace{12mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t} r_t l_t        \\
            &\hspace{6mm}\textbf{else}                                                           \\
            &\hspace{12mm}\theta_t \leftarrow \theta_t - \gamma \widehat{m_t}                \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `On the variance of the adaptive learning rate and beyond`_.

    This implementation provides an option to use either the original weight_decay implementation as in Adam
    (where the weight_decay is applied to the gradient) or the one from AdamW (where weight_decay is applied
    to the weight) through the decoupled_weight_decay option. When decoupled_weight_decay is set to False
    (default), it uses the original Adam style weight decay, otherwise, it uses the AdamW style which
    corresponds more closely to the `author's implementation`_ in the RAdam paper. Further information
    about decoupled weight decay can be found in `Decoupled Weight Decay Regularization`_.

    z
    Args:
        a  
        lr (float, Tensor, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        decoupled_weight_decay (bool, optional): whether to decouple the weight
            decay as in AdamW to obtain RAdamW. If True, the algorithm does not
            accumulate weight decay in the momentum nor variance. (default: False)
        z	
        a  

    .. _On the variance of the adaptive learning rate and beyond:
        https://arxiv.org/abs/1908.03265
    .. _author's implementation:
        https://github.com/LiyuanLucasLiu/RAdam
    .. _Decoupled Weight Decay Regularization:
        https://arxiv.org/abs/1711.05101

    r   rV   rW   rX   rY   r^   r_   r   r"   r!   r#   r   r   r   rZ   c       
           ^	^^^^^ [        U 5       GHj  u  nnU(       d  X   OX   * nX/   nX?   mXO   n[        R                  R                  5       (       dd  U(       a]  [	        5       nUR
                  R                  UR
                  R                  :X  a  UR
                  R                  U;   d   SU S35       e[        R                  " U5      (       aX  [        R                  " U5      n[        R                  " U5      n[        R                  " U5      n[        R                  " T5      mUS-  nU(       a  UO
[        U5      nUS:w  a.  U
(       a  UR                  SXx-  -
  5        OUR                  UUS9nUR                  USU-
  5        TR                  U5      R                  UUSU-
  S9  SUU-  -
  nSUU-  -
  mUU-  nSSU-
  -  S-
  mTSU-  UU-  -  T-  -
  mUU4S jnUUU	U4S	 jnU(       aA  [        R                  " TS
:  U" 5       U" 5       -  S5      nUR                  UU-  U-  SS9  GM+  TS
:  a&  UR                  UU-  U" 5       -  U" 5       -  SS9  GMW  UR                  UU-  SS9  GMm     g )NIIf capturable=True, params and state_steps must be on supported devices: .r   r   alpha)value   c                  D   > TS-
  TS-
  -  T -  T S-
  T S-
  -  T-  -  S-  $ )N   rt         ?rI   )rho_infrho_ts   r0   _compute_rect+_single_tensor_radam.<locals>._compute_rect=  sI    19 aKGaK058:  r2   c                     > TR                  5       n T(       a  U R                  T5      n OU R                  T5      n TS-  U -  $ )Nrw   )sqrtaddadd_)exp_avg_sq_sqrtbias_correction2r   r!   rL   s    r0   _compute_adaptive_lr2_single_tensor_radam.<locals>._compute_adaptive_lrE  sB    (oo/O"1"5"5c":"1"6"6s";$c)_<<r2         @r&   g      )	enumerater?   compileris_compilingr   r7   typerN   view_as_realr   mul_r~   lerp_addcmul_wherer   )r   rV   rW   rX   rY   r^   r_   r   r"   r!   r#   r   r   r   rZ   iparamrM   rK   step_tcapturable_supported_devicesr4   bias_correction1bias_corrected_exp_avgrz   r   updater   rL   rx   ry   s            ` `               @@@@r0   _single_tensor_radamr      s   $ f%5'uxehY+ ^
 ~~**,,+L+N(!!V]]%7%77LL%%)EE{ [[wZxxyz{F E""&&u-E%%d+D((1G++J7J 	!#vF);1%

1r001xx\x: 	dAI&''d!e)'Dud{?ud{? ")+;!; q5y/A%!d(eTk25EEE		= 	= [[]_/C/EEsF JJ-2V;4JHs{

**,- $o&    

1B6d
C] &r2   c       
         8  ^* [        U 5      S:X  a  g U(       a   S5       e[        R                  R                  5       (       d>  U(       a7  [	        SS9m*[        U*4S j[        X5       5       5      (       d   ST* S35       e[        R                  " XX#U/5      nUR                  5        GH  u  u  nnnnnn[        [        [           U5      n[        [        [           U5      n[        [        [           U5      n[        [        [           U5      n[        [        [           U5      n[        R                  R                  5       (       d>  US   R                  (       a*  [        R                  " U[        R                  " SS	S
9SS9  O[        R                  " US5        U(       a  [!        UUUU5        U(       a  [        R"                  " U5      nSSU-
  -  S-
  nU(       a  [        R$                  " UU5      n[        R&                  " U5        [        R                  " US5        [        R$                  " UU5      n[        R(                  " UU5        [        R(                  " US5        [        R*                  " UU5        [        R&                  " U5        [        R                  " UU5        UnOBU Vs/ s H5  nUS[-        U5      -  U[-        U5      -  -  SU[-        U5      -  -
  -  -
  PM7     nnUS:w  aX  U
(       a  [        R(                  " USXx-  -
  5        O4U(       a  [        R                  " UUUS9  O[        R.                  " UUUS9n[        R0                  " UUSU-
  5        [        R(                  " UU5        [        R2                  " UUUSU-
  5        AU(       GaT  [        R4                  " US5      n [        R4                  " US5      n![        R(                  " U U!5        A![        R(                  " U U5        US-
  US-
  -  n[        R6                  " UU5      n"[        R*                  " U U"5        A"[        R8                  " U 5        [        U U5       V#V$s/ s H!  u  n#n$[        R:                  " U$S:  U#S5      PM#     n%n#n$A AU% V%s/ s H  n%[        R:                  " U%S:  SS5      PM      n&n%[        R(                  " U&U5        [        R$                  " UU5      n[        R&                  " U5        [        R                  " US5        [        R*                  " U&U5        [        R&                  " U&5        [        R$                  " UU5      n[        R&                  " U5        [        R                  " US5        [        R8                  " U5        [        R(                  " UU5        [        R(                  " UW%5        A%[        R&                  " U5        [        R*                  " UU5        AOU V$s/ s H+  n$U$S:  a   U$S-
  U$S-
  -  U-  US-
  US-
  -  U$-  -  S-  OSPM-     n%n$U% V%s/ s H  n%U%S:  a  SOSPM     n'n%U Vs/ s H  nSU[-        U5      -  -
  PM     nn[        U'U5       V%V(s/ s H  u  n%n(UU%-  U(-  S-  PM     n&n%n([        UW%U5       VV%V(s/ s H'  u  nn%n(SU[-        U5      -  -
  S-  UU%-  U(-  -  S-  PM)     nn%nn([        R<                  " U5      n)[        R                  " U)U	5        [        R*                  " U)U5        [        R>                  " U)5        [        R                  " U)U&5        [        R2                  " UUU)5        GM     g s  snf s  sn$n#f s  sn%f s  sn$f s  sn%f s  snf s  sn(n%f s  sn(n%nf )Nr   z#_foreach ops don't support autogradF)supports_xlac              3      >#    U  HT  u  pUR                   R                  UR                   R                  :H  =(       a    UR                   R                  T;   v   MV     g 7frf   )r7   r   ).0rD   r4   r   s      r0   	<genexpr>&_multi_tensor_radam.<locals>.<genexpr>}  sN      
 4 HHMMT[[--- >!==>3s   AAro   rp   r&   cpu)r7   rq   r   rt   rv   r   r%      rw   ) r>   r?   r   r   r   allzipr   "_group_tensors_by_device_and_dtypevaluesr   listr   is_cpu_foreach_add_rB   r   _foreach_neg_foreach_pow_foreach_neg__foreach_mul__foreach_div_r   _foreach_add_foreach_lerp__foreach_addcmul__foreach_sub_foreach_mul_foreach_sqrt_r   _foreach_sqrt_foreach_reciprocal_)+r   rV   rW   rX   rY   r^   r_   r   r"   r!   r#   r   r   r   rZ   grouped_tensorsgrouped_params_grouped_grads_grouped_exp_avgs_grouped_exp_avg_sqs_grouped_state_steps__grouped_paramsgrouped_gradsgrouped_exp_avgsgrouped_exp_avg_sqsgrouped_state_stepsrx   r   r   
rho_t_listr4   numsub2denomnry   rectunrect_step_sizeunrectifiedbcbufferr   s+                                             @r0   _multi_tensor_radamr   a  sM   $ 6{aDDD >>&&((Z'H(
$  
 v3
 
 
 	w WWsVttuv		w 
  BB	{;O ""$		 	d6lO<T&\>:V.?@"4<1EF"4<1EF ~~**,,1DQ1G1N1N#U\\#e%DC  3Q7/?AT !..}=M q5y/A%
 $11%9LM 01 0!4$11%9LM 02EF 0!4 02BC 01 0':)J 0 0D T"#Jt,,. u
4 00022
 0   1%##NA8I4IJ ''%~\ %*$6$6%~\%M
 	-}a%iH/7q5y	

 $$Z3C%%j!4DT*W-{w{3G&&z7;EU+  % BES*AUAUXQECKC0AU   LPQDDD1Hc3 ?DQ 0"5$11%9LM 01 0!4 02BC 01$11%9LM 01 0!4  !12 0"5 0$7 01 02BC  ( (E 19 QYqy"  !!4u<>
   (   ?CCddq1c1dKC ;N :M$EZ---:M    7:+GW6X 6X($dR2%6X   
 '**=tEU&V &VND$ ez$///C7BINKbP&V   
 $$%89FC(F$45""6*F$45 	0@&Ik %X^
  R* D   s0   :<]0)(]5%];2^ =^^
^/.^
)single_tensor_fnr   c                h   [        S U 5       5      (       d  [        S5      eUc  [        XSS9u  nnU(       a.  [        R                  R                  5       (       a  [        S5      eU(       a*  [        R                  R                  5       (       d  [        nO[        nU" U UUUUUUUUUU
UUUU	S9  g)zhFunctional API that performs RAdam algorithm computation.

See :class:`~torch.optim.RAdam` for details.
c              3   V   #    U  H  n[        U[        R                  5      v   M!     g 7frf   )r'   r?   r   )r   ts     r0   r   radam.<locals>.<genexpr>>  s     @Kqz!U\\**Ks   ')zPAPI has changed, `state_steps` argument must contain a list of singleton tensorsNF)	use_fusedz6torch.jit.script not supported with foreach optimizers)
r^   r_   r   r"   r!   r   r#   r   r   rZ   )r   rQ   r	   r?   jitis_scriptingr   r   )r   rV   rW   rX   rY   r#   r   r   r   rZ   r   r^   r_   r   r"   r!   r   funcs                     r0   r   r   $  s    4 @K@@@^
 	
 1e

7 599))++STTuyy--//"#!5%r2   )FNFFFF)__doc__typingr   r   r   r?   r   	optimizerr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   __all__r   r   rA   rk   r   r   r   rI   r2   r0   <module>r      s   . ( (     $ G
NI Nd2f		 
	 
 		 		 		 	gK ``DL`D<`D 6l`D f	`D
 f`D `D `D 	`D `D 
`D !`D `D `D `D  !`DF@JL@J<@J 6l@J f	@J
 f@J @J @J 	@J @J 
@J !@J @J @J @J  !@JF  1EF $)" ;L;<; 6l; f	;
 f; !; d^; ; ; ; ; ;  !;" 	#;$ %;& 
'; G;r2   