from typing import cast, Optional, Union

import torch
from torch import Tensor

from .optimizer import (
    _default_to_fused_or_foreach,
    _device_dtype_check_for_fused,
    _differentiable_doc,
    _foreach_doc,
    _get_scalar_dtype,
    _get_value,
    _maximize_doc,
    _params_doc,
    _use_grad_for_differentiable,
    _view_as_real,
    Optimizer,
    ParamsT,
)

__all__ = ["Adagrad", "adagrad"]


class Adagrad(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: Union[float, Tensor] = 1e-2,
        lr_decay: float = 0,
        weight_decay: float = 0,
        initial_accumulator_value: float = 0,
        eps: float = 1e-10,
        foreach: Optional[bool] = None,
        *,
        maximize: bool = False,
        differentiable: bool = False,
        fused: Optional[bool] = None,
    ):
        if isinstance(lr, Tensor) and lr.numel() != 1:
            raise ValueError("Tensor lr must be 1-element")
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= lr_decay:
            raise ValueError(f"Invalid lr_decay value: {lr_decay}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
        if not 0.0 <= initial_accumulator_value:
            raise ValueError(
                f"Invalid initial_accumulator_value value: {initial_accumulator_value}"
            )
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")

        defaults = dict(
            lr=lr,
            lr_decay=lr_decay,
            eps=eps,
            weight_decay=weight_decay,
            initial_accumulator_value=initial_accumulator_value,
            foreach=foreach,
            maximize=maximize,
            differentiable=differentiable,
            fused=fused,
        )
        super().__init__(params, defaults)

        if fused:
            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
            self._need_device_dtype_check_for_fused = True

        # Initialize per-parameter state: a scalar step counter and the running sum
        # of squared gradients, seeded with `initial_accumulator_value`.
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["step"] = (
                    torch.zeros(
                        (),
                        dtype=_get_scalar_dtype(is_fused=group["fused"]),
                        device=p.device,
                    )
                    if group["fused"]
                    else torch.tensor(0.0, dtype=_get_scalar_dtype())
                )
                init_value = (
                    complex(initial_accumulator_value, initial_accumulator_value)
                    if torch.is_complex(p)
                    else initial_accumulator_value
                )
                state["sum"] = torch.full_like(
                    p, init_value, memory_format=torch.preserve_format
                )

    def __setstate__(self, state):
        super().__setstate__(state)
        fused = None
        for group in self.param_groups:
            group.setdefault("foreach", None)
            group.setdefault("maximize", False)
            group.setdefault("differentiable", False)
            fused = group.setdefault("fused", None)

        state_values = list(self.state.values())
        step_is_tensor = (len(state_values) != 0) and torch.is_tensor(
            state_values[0]["step"]
        )
        if not step_is_tensor:
            for s in state_values:
                s["step"] = torch.tensor(
                    float(s["step"]), dtype=_get_scalar_dtype(is_fused=fused)
                )

    def share_memory(self):
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["sum"].share_memory_()

    def _init_group(self, group, params_with_grad, grads, state_sums, state_steps):
        has_sparse_grad, has_complex = False, False
        for p in group["params"]:
            if p.grad is not None:
                if group["fused"] and getattr(
                    self, "_need_device_dtype_check_for_fused", False
                ):
                    _device_dtype_check_for_fused(p, cuda_unsupported=True)
                    self._need_device_dtype_check_for_fused = False
                has_sparse_grad |= p.grad.is_sparse
                has_complex |= torch.is_complex(p)
                params_with_grad.append(p)
                grads.append(p.grad)
                state = self.state[p]
                state_sums.append(state["sum"])
                state_steps.append(state["step"])
        return has_sparse_grad, has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = None

        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad: list[Tensor] = []
            grads: list[Tensor] = []
            state_sums: list[Tensor] = []
            state_steps: list[Tensor] = []

            has_sparse_grad, has_complex = self._init_group(
                group, params_with_grad, grads, state_sums, state_steps
            )

            adagrad(
                params_with_grad,
                grads,
                state_sums,
                state_steps,
                lr=group["lr"],
                weight_decay=group["weight_decay"],
                lr_decay=group["lr_decay"],
                eps=group["eps"],
                has_sparse_grad=has_sparse_grad,
                foreach=group["foreach"],
                maximize=group["maximize"],
                differentiable=group["differentiable"],
                has_complex=has_complex,
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
            )

        return loss


Adagrad.__doc__ = (
    r"""Implements Adagrad algorithm.

    .. math::
       \begin{aligned}
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{input}      : \gamma \text{ (lr)}, \: \theta_0 \text{ (params)}, \: f(\theta)
                \text{ (objective)}, \: \lambda \text{ (weight decay)},                          \\
            &\hspace{12mm}    \tau \text{ (initial accumulator value)}, \: \eta\text{ (lr decay)}\\
            &\textbf{initialize} :  state\_sum_0 \leftarrow \tau                          \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                                 \\
            &\textbf{for} \: t=1 \: \textbf{to} \: \ldots \: \textbf{do}                         \\
            &\hspace{5mm}g_t           \leftarrow   \nabla_{\theta} f_t (\theta_{t-1})           \\
            &\hspace{5mm} \tilde{\gamma}    \leftarrow \gamma / (1 +(t-1) \eta)                  \\
            &\hspace{5mm} \textbf{if} \: \lambda \neq 0                                          \\
            &\hspace{10mm} g_t \leftarrow g_t + \lambda \theta_{t-1}                             \\
            &\hspace{5mm}state\_sum_t  \leftarrow  state\_sum_{t-1} + g^2_t                      \\
            &\hspace{5mm}\theta_t \leftarrow
                \theta_{t-1}- \tilde{\gamma} \frac{g_t}{\sqrt{state\_sum_t}+\epsilon}            \\
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
            &\bf{return} \:  \theta_t                                                     \\[-1.ex]
            &\rule{110mm}{0.4pt}                                                          \\[-1.ex]
       \end{aligned}

    For further details regarding the algorithm we refer to `Adaptive Subgradient Methods for Online Learning
    and Stochastic Optimization`_.
    """
    + rf"""
    Args:
        {_params_doc}
        lr (float, Tensor, optional): learning rate (default: 1e-2)
        lr_decay (float, optional): learning rate decay (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        initial_accumulator_value (float, optional): initial value of the
            sum of squares of gradients (default: 0)
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-10)
        {_foreach_doc}
        {_maximize_doc}
        {_differentiable_doc}
        fused (bool, optional): whether the fused implementation (CPU only) is used.
            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
            are supported. (default: None). Please note that the fused implementation does not
            support sparse or complex gradients.
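
    Example (illustrative sketch; ``model``, ``loss_fn``, ``input`` and ``target``
    are placeholders for a user-defined model, loss function and data):

        >>> # xdoctest: +SKIP
        >>> optimizer = torch.optim.Adagrad(model.parameters(), lr=0.01)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()
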
    .. _Adaptive Subgradient Methods for Online Learning and Stochastic
        Optimization: http://jmlr.org/papers/v12/duchi11a.html

    """
)


def adagrad(
    params: list[Tensor],
    grads: list[Tensor],
    state_sums: list[Tensor],
    state_steps: list[Tensor],
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    has_sparse_grad: bool = False,
    foreach: Optional[bool] = None,
    differentiable: bool = False,
    has_complex: bool = False,
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs Adagrad algorithm computation.

    See :class:`~torch.optim.Adagrad` for details.
    """
    if not all(isinstance(t, torch.Tensor) for t in state_steps):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    # Pick the default implementation only when the user specified neither
    # `fused` nor `foreach`.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    if fused and not torch.jit.is_scripting():
        func = _fused_adagrad
    elif foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adagrad
    else:
        func = _single_tensor_adagrad

    func(
        params,
        grads,
        state_sums,
        state_steps,
        lr=lr,
        weight_decay=weight_decay,
        lr_decay=lr_decay,
        eps=eps,
        has_sparse_grad=has_sparse_grad,
        maximize=maximize,
        differentiable=differentiable,
        has_complex=has_complex,
        grad_scale=grad_scale,
        found_inf=found_inf,
    )


def _make_sparse(grad, grad_indices, values):
    size = grad.size()
    return torch.sparse_coo_tensor(grad_indices, values, size)


def _single_tensor_adagrad(
    params: list[Tensor],
    grads: list[Tensor],
    state_sums: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    assert grad_scale is None and found_inf is None
    for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps):
        # update step
        step_t += 1
        step = _get_value(step_t)
        grad = grad if not maximize else -grad

        if weight_decay != 0:
            if grad.is_sparse:
                raise RuntimeError(
                    "weight_decay option is not compatible with sparse gradients"
                )
            grad = grad.add(param, alpha=weight_decay)

        clr = lr / (1 + (step - 1) * lr_decay)

        if grad.is_sparse:
            # The update is non-linear, so indices must be unique.
            grad = grad.coalesce()
            grad_indices = grad._indices()
            grad_values = grad._values()

            state_sum.add_(_make_sparse(grad, grad_indices, grad_values.pow(2)))
            std = state_sum.sparse_mask(grad)
            std_values = std._values().sqrt_().add_(eps)
            param.add_(
                _make_sparse(grad, grad_indices, grad_values / std_values), alpha=-clr
            )
        else:
            is_complex = torch.is_complex(param)
            if is_complex:
                grad = torch.view_as_real(grad)
                state_sum = torch.view_as_real(state_sum)
                param = torch.view_as_real(param)
            state_sum.addcmul_(grad, grad, value=1)
            if differentiable:
                std = state_sum.sqrt() + eps
            else:
                std = state_sum.sqrt().add_(eps)
            param.addcdiv_(grad, std, value=-clr)
            if is_complex:
                param = torch.view_as_complex(param)
                state_sum = torch.view_as_complex(state_sum)


def _multi_tensor_adagrad(
    params: list[Tensor],
    grads: list[Tensor],
    state_sums: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
):
    assert not differentiable, "_foreach ops don't support autograd"
    assert grad_scale is None and found_inf is None

    # Foreach functions will throw errors if given empty lists.
    if len(params) == 0:
        return

    grouped_tensorlists = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, state_sums, state_steps]
    )
    for (
        device_params_,
        device_grads_,
        device_state_sums_,
        device_state_steps_,
    ), _ in grouped_tensorlists.values():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_state_sums = cast(list[Tensor], device_state_sums_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        device_has_sparse_grad = has_sparse_grad and any(
            grad.is_sparse for grad in device_grads
        )

        if device_has_sparse_grad:
            # Sparse gradients fall back to the single-tensor implementation.
            _single_tensor_adagrad(
                device_params,
                device_grads,
                device_state_sums,
                device_state_steps,
                lr=lr,
                weight_decay=weight_decay,
                lr_decay=lr_decay,
                eps=eps,
                has_sparse_grad=True,
                maximize=maximize,
                differentiable=differentiable,
                has_complex=has_complex,
                grad_scale=grad_scale,
                found_inf=found_inf,
            )
            continue

        # Handle complex parameters as views over pairs of real values.
        if has_complex:
            _view_as_real(device_params, device_grads, device_state_sums)

        if maximize:
            device_grads = torch._foreach_neg(device_grads)

        # Update steps. If the step tensors live on CPU, foreach falls back to a slow
        # per-tensor loop that re-wraps the scalar 1 every iteration, so wrap it in a
        # tensor once here; the alpha kwarg selects the right overload.
        if not torch.compiler.is_compiling() and device_state_steps[0].is_cpu:
            torch._foreach_add_(
                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(device_state_steps, 1)

        if weight_decay != 0:
            # Reuse the intermediate memory (device_grads) already allocated for maximize.
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(
                    device_grads, device_params, alpha=weight_decay
                )

        minus_clr = [
            -lr / (1 + (_get_value(step) - 1) * lr_decay) for step in device_state_steps
        ]

        torch._foreach_addcmul_(device_state_sums, device_grads, device_grads, value=1)

        std = torch._foreach_sqrt(device_state_sums)
        torch._foreach_add_(std, eps)

        if weight_decay != 0 or maximize:
            # Again, reuse the intermediate memory (device_grads) already allocated.
            torch._foreach_mul_(device_grads, minus_clr)
            numerator = device_grads
        else:
            numerator = torch._foreach_mul(device_grads, minus_clr)

        torch._foreach_addcdiv_(device_params, numerator, std)


def _fused_adagrad(
    params: list[Tensor],
    grads: list[Tensor],
    state_sums: list[Tensor],
    state_steps: list[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    lr: float,
    weight_decay: float,
    lr_decay: float,
    eps: float,
    has_sparse_grad: bool,
    maximize: bool,
    differentiable: bool,
    has_complex: bool,
) -> None:
    if not params:
        return
    if has_sparse_grad or has_complex:
        raise RuntimeError("`fused` does not support sparse grad or complex param")
    if differentiable:
        raise RuntimeError(
            "adagrad with fused=True does not support differentiable=True"
        )

    grad_scale_dict = (
        {grad_scale.device: grad_scale} if grad_scale is not None else None
    )
    found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, state_sums, state_steps]
    )
    for (device, _), (
        (device_params_, device_grads_, device_state_sums_, device_state_steps_),
        _,
    ) in grouped_tensors.items():
        device_params = cast(list[Tensor], device_params_)
        device_grads = cast(list[Tensor], device_grads_)
        device_state_sums = cast(list[Tensor], device_state_sums_)
        device_state_steps = cast(list[Tensor], device_state_steps_)

        device_grad_scale, device_found_inf = None, None
        if grad_scale is not None and grad_scale_dict is not None:
            if device not in grad_scale_dict:
                grad_scale_dict[device] = grad_scale.to(device, non_blocking=True)
            device_grad_scale = grad_scale_dict[device]
        if found_inf is not None and found_inf_dict is not None:
            if device not in found_inf_dict:
                found_inf_dict[device] = found_inf.to(device, non_blocking=True)
            device_found_inf = found_inf_dict[device]

        torch._foreach_add_(device_state_steps, 1)
        torch._fused_adagrad_(
            device_params,
            device_grads,
            device_state_sums,
            device_state_steps,
            lr=lr,
            lr_decay=lr_decay,
            weight_decay=weight_decay,
            eps=eps,
            maximize=maximize,
            grad_scale=device_grad_scale,
            found_inf=device_found_inf,
        )
        if device_found_inf is not None:
            # If the gradient scaler reported infs/NaNs, roll the step counters back.
            torch._foreach_sub_(
                device_state_steps, [device_found_inf] * len(device_state_steps)
            )
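

if __name__ == "__main__":
    # Illustrative self-check, not part of the upstream module: run one Adagrad step
    # on a single parameter and compare it with the documented update rule
    #     p <- p - lr * g / (sqrt(state_sum) + eps),  state_sum <- state_sum + g^2.
    # (Run with `python -m torch.optim.adagrad` so the relative imports resolve.)
    import math

    p = torch.tensor([1.0], requires_grad=True)
    opt = Adagrad([p], lr=0.1, eps=1e-10)

    (3.0 * p).sum().backward()  # gradient of the single element is 3.0
    opt.step()

    g = 3.0
    expected = 1.0 - 0.1 * g / (math.sqrt(g * g) + 1e-10)
    print(f"param after one step: {p.item():.6f} (expected {expected:.6f})")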