a
    h                  	   @   s  U d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z'm(Z)m*Z*m+Z+m,Z,m-Z-m.Z. d d
l/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z8 d dl9m:Z: g dZ;dZ<dZ=dZ>dZ?e@eA ZBee3eejCeDeEeAf ZFeeFeGeF eHeF eIeAdf f ZJeIeAeJf ZKeGeK ZLeIeAeeKeLf f ZMe@ ZNe@e eOd< e jPdd ZQe	G dd dZRe	G dd deRZSdfejTeAeAeUeUeBddd ZVG d!d" d"ZWdgd#d$ZXddd%ejTeHejYjZd&f eUee@ejT  eeR eSd'd(d)Z[eIeAeJf eMeSdd*d+d,Z\eejTejYjZf eAed-d.d/Z]eIeAef eSeIeAef d0d1d2Z^e_ ejTeSeIeAeJf d3d4d5Z`e_ ejTeIeAeJf eSe5d6d7d8ZaejYjZdd9d:d;ZbeMeIeAeJf d<d=d>ZcejYjZeIeAeJf eSeMd?d@dAZde_ ejTeHejYjZd&f eSeMdBdCdDZeejTejYjZeMeSeMdEdFdGZfe_ ejTeHejYjZd&f eMeSddHdIdJZgddd%ejTee@ejT  eeR eIeAeJf dKdLdMZhddd%ejTeejYjZeejYjZ f ee@ejT  eeR eMdNdOdPZiddd%ejTeejYjZeejYjZ f ee@ejT  eeR eHeIeAeJf eMf dNdQdRZjejTeeIejTeIeAeJf f eIeAeJf f eIeAeJf dSdTdUZkddVejTeIeAeJf eeR e5dWdXdYZlddVejTeejYjZeejYjZ f eMeeR ddZd[d\ZmddVejTeejYjZeejYjZ f eIeAeJf eMeeR e5d]d^d_ZneddVejTeeR dd`dadbZoeddVejTeHejYjZd&f eeR ddcdddeZpdS )h    N)	GeneratorIterable)asdict	dataclassfield)chain)AnyCallablecastno_type_checkOptionalUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dictZ_flat_paramparam_groupsparamsstater&   _patched_state_dictc                  c   s:   t  } t   zd V  W | r6t   n| r4t   0 d S N)gc	isenableddisableenable)
is_enabled r;   U/var/www/auris/lib/python3.9/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextQ   s    
r=   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dZeed< dS )r*   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiersN)__name__
__module____qualname____doc__r>   bool__annotations__r?   r@   rA   rB   rC   rD   rF   strr;   r;   r;   r<   r*   \   s   
$r*   c                   @   s   e Zd ZU eedZeeeej	f ee
ej	f f ed< eedZeeeej	f ee
ej	f f ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rG   rH   rI   r   dictrP   r   rM   torchTensorr$   rL   rQ   setrR   rS   rK   rT   
contextlibnullcontextrU   r	   listrV   nnModuler;   r;   r;   r<   rN      s"   
rN   rE   T)modelnamerF   skip_ddp_prefixskip_compiler_prefixreturnc                    s  | td}d|vr|hS |d}g }| }t|D ]r\}}	t|trj|	dksTJ |j}|sh||	 q4t|tr|t	|d k r||d  t
krd| t|t
}
 r  d  fdd|
jD   S t|t}|	tkr||	 t||	}q4t|tjjjr.|	dksJ |j}|s||	 q4t||rht|| |	 }rht||rht||}||	 |	tjjjkr|t	|d krtdq4t||	}q4d| tdhS )	a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   c                    s   h | ]}  | qS r;   r;   .0fqnprefixr;   r<   	<setcomp>       z_get_fqns.<locals>.<setcomp>	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrg   appendFSDPlen_FLAT_PARAMjoingetattrZ_fqnsr   rX   Z_dynamoZ
eval_frameZOptimizedModulerp   hasattrgetr^   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)r`   ra   rF   rb   rc   Z	obj_namesZfqn_obj_namesZcurr_objiZcurr_obj_nameZ
flat_paramZremoved_fqnr;   rl   r<   	_get_fqns   sP    


 








r   c                   @   s   e Zd ZdS )_EXTRA_STATEN)rG   rH   rI   r;   r;   r;   r<   r      s   r   c                 #   s6   t  tjttd fdd| dE d H  d S )N)rg   curr_fqnrd   c                 3   s   |  |r| dnd}|  D ]Z\}}|v r6q$t|  rb|t|    v rb|d d }n| | }||E d H  q$t| jdd| jddD ]*\}}|| jv rq| | }||fV  qt| j	dt
jjt
jjkr| t
jjj }|t fV  d S )Nrf   re   F)recurseget_extra_state)addZnamed_childrenr|   r{   valuesr   Znamed_buffersnamed_parametersZ_non_persistent_buffers_set	__class__r^   r_   r   r~   rg   r   r   )rg   r   ra   	submodulenew_fqnobjrF   r   Zvisited_modulesr;   r<   r      s2    

z+_iterate_valid_model_state.<locals>.recursere   )rZ   r^   r_   rM   r   )r`   rF   r;   r   r<   _iterate_valid_model_state   s    "r   )
submodulesoptions.)r`   optims
optim_onlyr   r   rd   c                C   s:  |rt dt |r |s td|p(t }i }i }t| D ]\}}t|trNq:t| |}	|	|d}
|
durt
tt || |	 || ||< n|	 ||< |	D ]}
t|ts|||
< qq:t| D ]"\}}|D ]}
t
tj|||
< qqt }|rVt|}|  D ]L\}}||vrqt| |}	t|	dks>J d|dd |	D  q|jrn|jsntdt| }|r|jrt|j|jd	}t|j|jp|jd	}tj}nt|jd
}t |jd
}tj!}t"j#dd }t$j%|| |||d}nt"j&}t'f i t(|||||t
tt)j* || t|dkdS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrh   z)Submodule FQN should only have 1 instancec                 s   s   | ]}| d V  qdS )rf   Nr;   ri   r;   r;   r<   	<genexpr>K  ro   z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpuZ
rank0_only)r   c              	   s   st   t  X t jddtd tj| |||d d V  W d    n1 sH0    Y  W d    n1 sf0    Y  d S )NignorezFSDP.state_dict_type)messagecategoryrg   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningrw   r   r   r;   r;   r<   $fsdp_state_dict_type_without_warningi  s    
z=_verify_options.<locals>.fsdp_state_dict_type_without_warningr   r   )rP   rQ   rR   rU   rV   rS   rT   )+r   warnr   r   r*   r   rt   r   r   r}   r
   rZ   rM   updatecopyr]   itemsrX   rY   named_modulesrx   rC   r>   
ValueErrorrw   rV   r   r?   r   r   ZFULL_STATE_DICTr   r   ZSHARDED_STATE_DICTr[   contextmanager	functoolspartialr\   rN   r   r^   r_   )r`   r   r   r   r   rP   rQ   ra   paramfqnsrk   Zparam_Zfqns_rR   rg   rV   r   r   r   r   rU   r;   r;   r<   _verify_options  s    








r   )model_state_dictoptim_state_dictinford   c                 C   s   |j D ]}t|}|d usJ dq|jrf| sf|jsf|jsf|jrF|jsf|jrf|jsft	dt
 d|jr|s|jr||js|jst	d| |  D ] }t|v rt	| dt dqd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rf   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rV   r   rS   rR   r@   r?   r>   rB   rC   r   distget_rankrT   keysry   )r   r   r   rg   Z
fsdp_statekeyr;   r;   r<   _verify_state_dict  sV    
	r   )r   apird   c                 C   s,   t | |}|tv r(tjt | j|| d}|S )N)self)r{   r4   r   r   r   )r   r   callr;   r;   r<   _state_dict_fn  s    
r   )
state_dictr   rd   c                 C   sD   |j r.|jrtj sdnd}t| |j|dS |jr<t| S | S d S )Nr;   )r   )r?   
ranks_only)r>   r?   rX   distributedZis_initializedr   r   )r   r   r   r;   r;   r<   _maybe_full_or_cpu_state_dict  s    r   )r`   r   rd   c                 C   sz  |j s
i S |  t| d }W d    n1 s40    Y  t| D ]t}t| |}t|dkspJ ||ftt|}||krJt	ddd}|||st
d| d| ||||< qJ|jr(i }| D ]N}|jD ]B}||sq|jr|| ||< q|t|d  }	|| ||	< qq|}|jrp|  D ]6\}}
|
jrLq8t| |}|D ]}|| qZq8t||S )Nr   rh   )rd   c                 S   s   t |t | krdS |d}| d}d}t|D ]P\}}||| krr|d7 }|t |kr|t |d k  S q4|dv r~q4q4 dS q4dS )NFrf   r   rh   )rg   rp   T)rx   rr   rs   )r   rk   Z	fqn_splitZ	key_splitZfqn_idxZkey_idxZkey_namer;   r;   r<   verify  s    

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rS   rU   r   r]   r   r   rx   nextiterrK   r   poprR   
startswithrA   r@   r   requires_gradr   )r`   r   r   r   r   rk   r   new_state_dictrm   r   r   r;   r;   r<   _get_model_state_dict  s@    
*




r   )r`   r   r   rd   c                 C   s  |j r|s|jsti i S i }t| |jD ]\}}t| ||j}t| ||jddd}t||D ]^\}}	|jrxt dkr||	kr|	|d }
|
d u r|j
rtd| dn|
||	< |||	< q^q*d}|js|jrt }| D ]*\}}t|r| dkr||j qtd|v r0|td d}t|dkrP|tj  nt|dkrftd	|jrt|||	 |j
|jd
 n|jrt|||	 d || | , ttt| d||j
|dW  d    S 1 s0    Y  d S )NF)rb   rc   r   zMissing key: rf   metaTrh   zMultiple devices found)devicerB   r?   r   load_state_dict)r   rB   assign) rS   rC   r!   r   rF   r   zipr   r   r   rB   r   r>   rZ   r   rX   Z	is_tensordimr   r   removerx   Zdistributed_c10dZ_get_pg_default_devicer   r   r?   r   r   rU   r
   r   )r`   r   r   local_state_dictr   valuer   Zfqns_with_prefixrk   Zfqn_with_prefix
load_valuer   Zdevicesr;   r;   r<   _load_model_state_dict  sp    



r   )optimrd   c                 C   s   | j r
dS | jD ]$}|t D ]}|jdur  dS qq| jD ]$}|t D ]}|jrHt||_qHq<g }| jD ]<}d|v rl||d  t|d tj	rt
dnd|d< ql| jdd | jD ]}d|v r|d|d< q| jdd dS )zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)Zset_to_none)r3   r1   _PARAMSZgradr   rX   Z
zeros_likerv   rt   rY   Ztensorstepr   Z	zero_grad)r   param_groupr   Zlrsr;   r;   r<   _init_optim_state]  s.    




r   )r   rd   c           	   
   C   s   dd }i }t t| t  D ]@\}}t t| D ](\}}|| ||t d| d| < q4qt t| t D ]L}|t}t tt	 |D ].}| D ] \}}||t d| d| < qqqn|S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 S   s*   t | tjttfs&tdt|  dd S )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rf   )rt   rX   rY   intfloatNotImplementedErrortype)vr;   r;   r<   _raise_if_type_not_supported  s    z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedrf   )
r
   r'   _STATEr   r(   _PGr   r   r]   rM   )	r   r   retrk   r3   kr   r   r   r;   r;   r<   _flatten_optim_state_dict  s    *
r   )r   r   r   rd   c                 C   s  i }g }t |t|i}| jD ]}|tg i |t D ]}|j| D ]}||jv rd}	| D ]2}
|
tkrlq^t d| d|
 }||v rd}	 qq^nd}	|	sqD|d t }t|t	sJ || |j
sqDi ||< | j|  D ]*}|t  d| d|  tt|| |< qqDq6tt	t |d t d }| D ]}
|
tkrDq2|t d| d|
  }|
|d vrx||d |
< n<|d |
 |kr2td| d|
 d| d|d |
  d	q2q|S )	z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    Frf   Tr   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r   r   r1   rv   r   rP   rQ   r   rt   r]   r   r3   r
   r'   rM   r   )r   r   r   r3   pg_state
return_osdr   r   rk   	in_paramsr   Zflatten_keyr2   Z
state_nameZfirst_param_fqnr   r;   r;   r<   _unflatten_optim_state_dict  sb    	




r   )r`   
optimizersr   rd   c              	      s,  |j s
i S ti tg i}|D ]}t| t|d }|jr|  t| ||}W d    n1 sf0    Y  |svqt	|t 
 D ]*}d|v r|t ||t |dd< q|t D ]}dd |t D }||t< qnt	tdd |jD }tt|tt|}	i  |  D ]X\}
}t| |
}t|d	ks<J tt|}||	vrVq|	| }| |< | |< qt	|t 
 D ]$}
 |
 }|t |
|t |< q|t D ] } fd
d|t D |t< q|sqtt|t |t  tt|t |t  q|jr"ttt |}t!||S )Nr   rp   
_orig_mod.re   c                 S   s   g | ]}| d dqS )r   re   rq   rj   r   r;   r;   r<   
<listcomp>!  ro   z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s   | ]}|t  V  qd S r5   )r   )rj   gr;   r;   r<   r   $  ro   z(_get_optim_state_dict.<locals>.<genexpr>rh   c                    s   g | ]} | qS r;   r;   )rj   pidZfqn_pid_mappingr;   r<   r   6  ro   )"rT   r   r   r   r   rV   rU   rw   r   r]   r   r   rq   r   r   from_iterabler1   rW   r   rangerx   r   r   r   r   r
   r'   r   r(   extendrD   r)   r   r   )r`   r   r   r   r   Zosdr   r   r2   Zparam_pid_mappingr   r   r   rk   r   groupr;   r   r<   _get_optim_state_dict  sV    

, 

r   )r`   r   r   r   rd   c              	   C   sL  i }g }t |t|i}i }tdd tt|t   D r<|S |jD ]}|tg i |t D ]}	|j	|	 D ]}
|
|j
v rd}tt|t D ]"}|
ttt |t v rd} qqnd}|sql|d t }t|tsJ ||
 |	jrtt|t  |
 ||
< tt|t D ]6}|
ttt |t v r
t|t d |t|< q
qlq^t|t dkrBg }tt|t D ],}tttt |t dkrh|| qht|dkrtdt|t t|jkrtd	t|t d |t|< qBtt|t D ]R}|t|d}|dkrq| D ]$\}}|tkr4q||| |< qq|S )
a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s   | ]}t |tV  qd S r5   )rt   r   r   r;   r;   r<   r   `  s   z*_split_optim_state_dict.<locals>.<genexpr>FTr   rh   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r   r   allr
   r'   r   r1   rv   r   rP   rQ   r(   r]   rM   rt   r   rx   idr   r}   r   )r`   r   r   r   r3   r   r   Z
pg_mappingr   r   rk   r   Zloaded_param_groupr2   r   Zpg_idxr   r   r;   r;   r<   _split_optim_state_dictF  sp    

 

r   )r`   r   r   r   rd   c              	      s  |j s
d S |D ]x}t| |rTt|v r8t| |||}qXt|ttttf ||}ni }|j	rr| 
 D ]\}}t| |}t| |dd}	||	krqht|dksJ | |	 |t D ]6}
ttttf |
}fdd|t D }||t< qtt|t }t| D ]&}|v r||||< qqh|  t| ||}W d    n1 sd0    Y  n|jrxd|_t| |f|}d|_d   fdd}ttj||} d usJ t|\}}t|\}}|jrt|| d	 nt || d	 | D ]6}||vr||v s"J || ||< || ||< qt!||}|t D ]&}t|vrPg ttttf |t< qPt"|d
|d qd S )NF)rc   rh   c                    s   g | ]}|  qS r;   r   )rj   r   )rk   fqn_with_compilerr;   r<   r     s   z*_load_optim_state_dict.<locals>.<listcomp>Tc                    s2   |   dkr. d u r| j n | jkr.td| S )Nr   zDevice mismatch)r   r   r   )tr   r;   r<   _device  s    
z'_load_optim_state_dict.<locals>._devicer   r   r   )#rT   r   r   r   r   r
   rW   rM   r&   rV   r   r   rx   r   r   r   r   r'   r]   r   rq   rU   rw   Zoptim_state_dict_to_loadr>   r   r#   rX   rY   r   rC   r   r   r   r   )r`   r   r   r   r   r   Zoriginal_fqn_r   Zfqns_with_compilerr   valr2   Z	osd_stater   r   r   Zflatten_osdZosd_mappingZflatten_local_osdZlocal_osd_mappingZ	optim_keyZpgr;   )r   rk   r   r<   _load_optim_state_dict  s~    




(	

r   )r`   r   r   rd   c                C   sV   t  < t| dd||d}t| |}t|i | |W  d   S 1 sH0    Y  dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    r;   Fr   r   r   N)r=   r   r   r   )r`   r   r   r   r   r;   r;   r<   r+     s    
r+   )r`   r   r   r   rd   c                C   st   t  Z t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  d   S 1 sf0    Y  dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tr   N)	r=   rt   rX   r   	Optimizertupler   r   r   )r`   r   r   r   r   r   r;   r;   r<   r,   *  s    r,   c                C   s   t  h t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  d   S 1 st0    Y  dS )a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
        ...     fsdp_model, fsdp_optim
        ... )

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fr   N)
r=   rt   rX   r   r  r  r   r   r   r   )r`   r   r   r   r   r   r   r;   r;   r<   r-   W  s     H
r-   )r`   r   rd   c           	         s   |si S t tt| tjrtdt t	t
tjt
ttf f |}i }| D ]p\}}|  D ]^\}}||krvqdt| |}t|dksJ dtt| d | fdd| D  qdqT|S t	t
ttf |S d S )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rh   z/FQNs for a submodule should only have 1 elementrf   c                    s   i | ]\}} | |qS r;   r;   )rj   Zsubfqnr   rl   r;   r<   
<dictcomp>  ro   z/_unflatten_model_state_dict.<locals>.<dictcomp>)rt   r   r   r   r^   r_   r   r   r   r
   rW   rM   r&   r   r   r   rx   r   )	r`   r   Zcast_state_dictr   r   Zsub_state_dictra   mr   r;   rl   r<   _unflatten_model_state_dict  s*    
r  )r   )r`   r   r   rd   c                C   s\   t | |}t 8 t| dd|d}t|i | t| ||W  d   S 1 sN0    Y  dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    r;   Fr   r   N)r  r=   r   r   r   )r`   r   r   r   r;   r;   r<   r.     s    r.   )r`   r   r   r   rd   c                C   sp   t  V t|tjjr|fnt|}t| |d|d}ti || t| ||| W d   n1 sb0    Y  dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
        ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
        initialized correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr  N)	r=   rt   rX   r   r  r  r   r   r   )r`   r   r   r   r   r;   r;   r<   r/     s    r/   )r`   r   r   r   r   rd   c                C   s   t | |}t d t|tjjr&|fnt|}t| || |d}t||| t	| ||| t
| ||W  d   S 1 sz0    Y  dS )a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
        is called on the optimizers. Otherwise, the optimizer states won't be initialized
        correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r  N)r  r=   rt   rX   r   r  r  r   r   r   r   )r`   r   r   r   r   r   r;   r;   r<   r0   %  s    .
r0   )r`   r   rd   c                   sj   t jt| |dfdd}|| _t jt| |d tttf d fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r   c                      s     S r5   r;   r;   _state_dict_callr;   r<   state_dict_call  s    z0_patch_model_state_dict.<locals>.state_dict_callr   c                    s    | d d S )N)r   r;   r   _load_state_dict_callr;   r<   load_state_dict_call  s    z5_patch_model_state_dict.<locals>.load_state_dict_callN)r   r   r+   r   r.   rW   rM   r   r   r4   r   )r`   r   r	  r  r;   r  r  r<   _patch_model_state_dictg  s     
r  )r`   r   r   rd   c                   s   t jt| ||dfdd}t jt| ||d tttf d fdd}t| t| t	|t
jjrr|fnt|}|D ]}||_||_q~dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r   r   c                      s     S r5   r;   r;   r  r;   r<   r	    s    z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    s    | d d S )N)r   r;   r   r
  r;   r<   r    s    z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r   r   r,   r/   rW   rM   r   r4   r   rt   rX   r   r  r  r   r   )r`   r   r   r	  r  r   r;   r  r<   _patch_optimizer_state_dict  s.    

r  )rE   TT)rE   )qr[   r   r6   r   collections.abcr   r   Zdataclassesr   r   r   	itertoolsr   typingr   r	   r
   r   r   r   rX   Ztorch.distributedr   r   Ztorch.nnr^   Z'torch.distributed._shard.sharded_tensorr   Z#torch.distributed._state_dict_utilsr   r   r   r   r   r   Z;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   Ztorch.distributed.fsdpr   r   r   rw   r   r   r   r   r   Z$torch.distributed.fsdp._common_utilsr   r   Ztorch.distributed.tensorr    Ztorch.nn.modules.moduler!   Ztorch.nn.parallelr"   ru   Ztorch.utils._pytreer#   __all__ry   r   r   r   rZ   rM   r$   rY   r   r   r%   r]   r  rW   r&   r'   r(   r)   r4   rL   r   r=   r*   rN   r_   rK   r   r   r   r   r  r   r   r   r   Zno_gradr   r   r   r   r   r   r   r   r+   r,   r-   r  r.   r/   r0   r  r  r;   r;   r;   r<   <module>   sb  
  (


/   G
- 
- 

?
D*A
?@^c
)1\$
%
,1
B6