o
    Zh                     @   sZ  U d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZm	Z	m
Z
 d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$ d d	l%m&Z&m'Z'm(Z)m*Z*m+Z+m,Z,m-Z-m.Z. d d
l/m0Z0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z8 d dl9m:Z: g dZ;dZ<dZ=dZ>dZ?e@eA ZBee3eejCeDeEeAf ZFeeFeGeF eHeF eIeAdf f ZJeIeAeJf ZKeGeK ZLeIeAeeKeLf f ZMe@ ZNe@e eOd< e jPdd ZQe	G dd dZRe	G dd deRZSejT			dcdejUdeAd eAd!eVd"eVd#eBfd$d%ZWG d&d' d'ZXddd(d)ZYddd*dejUd+eHejZj[d,f d-eVd.ee@ejU  d/eeR d#eSfd0d1Z\d2eIeAeJf d3eMd4eSd#dfd5d6Z]d7eejUejZj[f d8eAd#efd9d:Z^d;eIeAef d4eSd#eIeAef fd<d=Z_e` dejUd4eSd#eIeAeJf fd>d?Zae` dejUd;eIeAeJf d4eSd#e5fd@dAZbdBejZj[d#dfdCdDZcd;eMd#eIeAeJf fdEdFZddBejZj[d;eIeAeJf d4eSd#eMfdGdHZee` dejUdIeHejZj[d,f d4eSd#eMfdJdKZfdejUdBejZj[d3eMd4eSd#eMf
dLdMZge` dejUdIeHejZj[d,f d;eMd4eSd#df
dNdOZhddd*dejUd.ee@ejU  d/eeR d#eIeAeJf fdPdQZiddd*dejUdIeejZj[eejZj[ f d.ee@ejU  d/eeR d#eMf
dRdSZjddd*dejUdIeejZj[eejZj[ f d.ee@ejU  d/eeR d#eHeIeAeJf eMf f
dTdUZkdejUd;eeIejUeIeAeJf f eIeAeJf f d#eIeAeJf fdVdWZlddXdejUd2eIeAeJf d/eeR d#e5fdYdZZmddXdejUdIeejZj[eejZj[ f d3eMd/eeR d#df
d[d\ZnddXdejUdIeejZj[eejZj[ f d2eIeAeJf d3eMd/eeR d#e5fd]d^ZoeddXdejUd/eeR d#dfd_d`ZpeddXdejUdIeHejZj[d,f d/eeR d#dfdadbZqdS )e    N)	GeneratorIterable)asdict	dataclassfield)chain)AnyCallablecastno_type_checkOptionalUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dictZ_flat_paramparam_groupsparamsstater&   _patched_state_dictc                  c   s@    t  } t   zd V  W | rt   d S d S | rt   w w N)gc	isenableddisableenable)
is_enabled r;   V/var/www/auris/lib/python3.10/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextQ   s   
r=   c                   @   sr   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dZeed< dS )r*   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiersN)__name__
__module____qualname____doc__r>   bool__annotations__r?   r@   rA   rB   rC   rD   rF   strr;   r;   r;   r<   r*   \   s   
 $r*   c                   @   s   e Zd ZU eedZeeeej	f ee
ej	f f ed< eedZeeeej	f ee
ej	f f ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rG   rH   rI   r   dictrP   r   rM   torchTensorr$   rL   rQ   setrR   rS   rK   rT   
contextlibnullcontextrU   r	   listrV   nnModuler;   r;   r;   r<   rN      s$   
 rN   rE   TmodelnamerF   skip_ddp_prefixskip_compiler_prefixreturnc                    s  | td}d|vr|hS |d}g }| }t|D ]\}}	t|tr4|	dks)J |j}|s3||	 qt|trz|t	|d k rf||d  t
krfd| t|t
}
 rZ  d  fdd|
jD   S t|t}|	tkry||	 t||	}qt|tjjjr|	dksJ |j}|s||	 qt||rt|| |	 }rt||rt||}||	 |	tjjjkr|t	|d krtdqt||	}qd| tdhS )	a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   c                    s   h | ]}  | qS r;   r;   .0fqnprefixr;   r<   	<setcomp>       z_get_fqns.<locals>.<setcomp>	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrg   appendFSDPlen_FLAT_PARAMjoingetattrZ_fqnsr   rX   Z_dynamoZ
eval_frameZOptimizedModulerp   hasattrgetr^   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)r`   ra   rF   rb   rc   Z	obj_namesZfqn_obj_namesZcurr_objiZcurr_obj_nameZ
flat_paramZremoved_fqnr;   rl   r<   	_get_fqns   sX   



 










r   c                   @   s   e Zd ZdS )_EXTRA_STATEN)rG   rH   rI   r;   r;   r;   r<   r      s    r   c                 #   s<    t  dtjdtdtf fdd| dE d H  d S )Nrg   curr_fqnrd   c                 3   s    |  |r| dnd}|  D ]-\}}|v rqt|  r2|t|    v r2|d d }n| | }||E d H  qt| jdd| jddD ]\}}|| jv rXqN| | }||fV  qNt| j	dt
jjt
jjkr| t
jjj }|t fV  d S d S )Nrf   re   F)recurseget_extra_state)addZnamed_childrenr|   r{   valuesr   Znamed_buffersnamed_parametersZ_non_persistent_buffers_set	__class__r^   r_   r   r~   rg   r   r   )rg   r   ra   	submodulenew_fqnobjrF   r   Zvisited_modulesr;   r<   r      s2   

z+_iterate_valid_model_state.<locals>.recursere   )rZ   r^   r_   rM   r   )r`   rF   r;   r   r<   _iterate_valid_model_state   s    "r   )
submodulesoptionsoptims.
optim_onlyr   r   c                C   s&  |rt dt |r|std|pt }i }i }t| D ]@\}}t|tr'qt| |}	|	|d}
|
durIt
tt || |	 || ||< n|	 ||< |	D ]}
t|ts\|||
< qQqt| D ]\}}|D ]
}
t
tj|||
< qjqdt }|rt|}|  D ]"\}}||vrqt| |}	t|	dksJ d|dd |	D  q|jr|jstdt| }|r|jrt|j|jd	}t|j|jp|jd	}tj}nt|jd
}t |jd
}tj!}t"j#dd }t$j%|| |||d}nt"j&}t'di t(|||||t
tt)j* || t|dkdS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrh   z)Submodule FQN should only have 1 instancec                 s   s    | ]}| d V  qdS )rf   Nr;   ri   r;   r;   r<   	<genexpr>L  s    z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpuZ
rank0_only)r   c              	   s   s    t  5 t jddtd tj| |||d d V  W d    n1 s%w   Y  W d    d S W d    d S 1 s=w   Y  d S )NignorezFSDP.state_dict_type)messagecategoryrg   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningrw   r   r   r;   r;   r<   $fsdp_state_dict_type_without_warningj  s    
"z=_verify_options.<locals>.fsdp_state_dict_type_without_warningr   r   )rP   rQ   rR   rU   rV   rS   rT   r;   )+r   warnr   r   r*   r   rt   r   r   r}   r
   rZ   rM   updatecopyr]   itemsrX   rY   named_modulesrx   rC   r>   
ValueErrorrw   rV   r   r?   r   r   ZFULL_STATE_DICTr   r   ZSHARDED_STATE_DICTr[   contextmanager	functoolspartialr\   rN   r   r^   r_   )r`   r   r   r   r   rP   rQ   ra   paramfqnsrk   Zparam_Zfqns_rR   rg   rV   r   r   r   r   rU   r;   r;   r<   _verify_options  s   









r   model_state_dictoptim_state_dictinfoc                 C   s   |j D ]}t|}|d usJ dq|jr3| s3|js3|js3|jr#|js3|jr3|js3t	dt
 d|jrH|sH|jr>|jsH|jsHt	d| |  D ]}t|v r\t	| dt dqLd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rf   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rV   r   rS   rR   r@   r?   r>   rB   rC   r   distget_rankrT   keysry   )r   r   r   rg   Z
fsdp_statekeyr;   r;   r<   _verify_state_dict  sZ   
	r   r   apic                 C   s,   t | |}|tv rtjt | j|| d}|S )N)self)r{   r4   r   r   r   )r   r   callr;   r;   r<   _state_dict_fn  s   
r   
state_dictc                 C   s@   |j r|jrtj sdnd}t| |j|dS |jrt| S | S )Nr;   )r   )r?   
ranks_only)r>   r?   rX   distributedZis_initializedr   r   )r   r   r   r;   r;   r<   _maybe_full_or_cpu_state_dict  s   r   c                 C   s  |j si S |  t| d }W d    n1 sw   Y  t| D ]:}t| |}t|dks8J ||ftt|}||kr_dt	fdd}|||sXt
d| d| ||||< q%|jri }| D ]&}|jD ] }||svqn|jr|| ||< qn|t|d  }	|| ||	< qnqi|}|jr|  D ]\}}
|
jrqt| |}|D ]}|| qqt| D ]\}}t|r|jr|| qt||S )Nr   rh   rd   c                 S   s   t |t | kr
dS |d}| d}d}t|D ]&\}}||| kr9|d7 }|t |kr8|t |d k  S q|dv r>q dS dS )NFrf   r   rh   )rg   rp   T)rx   rr   rs   )r   rk   Z	fqn_splitZ	key_splitZfqn_idxZkey_idxZkey_namer;   r;   r<   verify  s   

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rS   rU   r   r]   r   r   rx   nextiterrK   r   poprR   
startswithrA   r@   r   requires_gradr   rX   	is_tensoris_metar   )r`   r   r   r   r   rk   r   new_state_dictrm   r   r   pr;   r;   r<   _get_model_state_dict  sP   







r   c                 C   s  |j r|s|jsti i S i }t| |jD ]J\}}t| ||j}t| ||jddd}t||D ]/\}}	|jr<t dkrZ||	krZ|	|d }
|
d u rV|j
rUtd| dn|
||	< |||	< q/qd}|jsh|jrt }| D ]\}}t|r| dkr||j qotd|v r|td d}t|dkr|tj  n
t|dkrtd	|jrt|||	 |j
|jd
 n|jrt|||	 d | D ]\}}|||< q|  ttt| d||j
|dW  d    S 1 sw   Y  d S )NF)rb   rc   r   zMissing key: rf   metaTrh   zMultiple devices found)devicerB   r?   r   load_state_dict)r   rB   assign)rS   rC   r!   r   rF   r   zipr   r   r   rB   r   r>   rZ   r   rX   r   dimr   r   removerx   Zdistributed_c10dZ_get_pg_default_devicer   r   r?   r   rU   r
   r   )r`   r   r   local_state_dictr   valuer   Zfqns_with_prefixrk   Zfqn_with_prefixZ
load_valuer   ZdevicesZlocal_stater;   r;   r<   _load_model_state_dict  sv   



$r   optimc                 C   s   | j rdS | jD ]}|t D ]}|jdur  dS qq| jD ]}|t D ]}|jr/t||_q$qg }| jD ]}d|v rT||d  t|d tj	rPt
dnd|d< q6| jdd | jD ]}d|v rk|d|d< q^| jdd dS )zH
    Initialize optim states by calling the step() with zero grads.
    Nlrg        )closurer   T)Zset_to_none)r3   r1   _PARAMSZgradr   rX   Z
zeros_likerv   rt   rY   Ztensorstepr   Z	zero_grad)r   param_groupr   Zlrsr;   r;   r<   _init_optim_statec  s:   




r   c           	   
   C   s   dd }i }t t| t  D ] \}}t t| D ]\}}|| ||t d| d| < qqt t| t D ]&}|t}t tt	 |D ]}| D ]\}}||t d| d| < qKqEq7|S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 S   s*   t | tjttfstdt|  dd S )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rf   )rt   rX   rY   intfloatNotImplementedErrortype)vr;   r;   r<   _raise_if_type_not_supported  s   z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedrf   )
r
   r'   _STATEr   r(   _PGr   r   r]   rM   )	r   r   retrk   r3   kr   r   r   r;   r;   r<   _flatten_optim_state_dict  s   *
r   c                 C   s  i }g }t |t|i}| jD ]}|tg i |t D ]j}|j| D ]b}||jv rGd}	| D ]}
|
tkr5q.t d| d|
 }||v rEd}	 nd}	|	sLq!|d t }t|t	sYJ || |j
sbq!i ||< | j|  D ]}|t  d| d|  tt|| |< qmq!qtt	t |d t d }| D ]=}
|
tkrq|t d| d|
  }|
|d vr||d |
< q|d |
 |krtd| d|
 d| d|d |
  d	qq|S )	z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    Frf   Tr   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r   r   r1   rv   r   rP   rQ   r   rt   r]   r   r3   r
   r'   rM   r   )r   r   r   r3   pg_state
return_osdr   r   rk   	in_paramsr   Zflatten_keyr2   Z
state_nameZfirst_param_fqnr   r;   r;   r<   _unflatten_optim_state_dict  sl   	



r   
optimizersc              	      s  |j si S ti tg i}|D ]}t| t|d }|jrm|  t| ||}W d    n1 s2w   Y  |s:qt	|t 
 D ]}d|v rW|t ||t |dd< qB|t D ]}dd |t D }||t< q\nut	tdd |jD }tt|tt|}	i  |  D ](\}
}t| |
}t|d	ksJ tt|}||	vrq|	| }| |< | |< qt	|t 
 D ]}
 |
 }|t |
|t |< q|t D ]} fd
d|t D |t< q|sqtt|t |t  tt|t |t  q|jr	ttt |}t!||S )Nr   rp   
_orig_mod.re   c                 S   s   g | ]}| d dqS )r   re   rq   rj   r   r;   r;   r<   
<listcomp>'  ro   z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s    | ]}|t  V  qd S r5   )r   )rj   gr;   r;   r<   r   *  s    z(_get_optim_state_dict.<locals>.<genexpr>rh   c                    s   g | ]} | qS r;   r;   )rj   pidZfqn_pid_mappingr;   r<   r   <  s    )"rT   r   r   r   r   rV   rU   rw   r   r]   r   r   rq   r   r   from_iterabler1   rW   r   rangerx   r   r   r   r   r
   r'   r   r(   extendrD   r)   r   r   )r`   r   r   r   r   Zosdr   r   r2   Zparam_pid_mappingr   r   r   rk   r   groupr;   r   r<   _get_optim_state_dict  s\   




r   c              	   C   s:  i }g }t |t|i}i }tdd tt|t   D r|S |jD ]}|tg i |t D ]q}	|j	|	 D ]i}
|
|j
v rXd}tt|t D ]}|
ttt |t v rVd} nqEnd}|s]q5|d t }t|tsjJ ||
 |	jr}tt|t  |
 ||
< tt|t D ]}|
ttt |t v rt|t d |t|< qq5q.t|t dkrg }tt|t D ]}tttt |t dkr|| qt|dkrtdt|t t|jkrtd	t|t d |t|< q!tt|t D ]'}|t|d}|dkrq| D ]\}}|tkrq||| |< qq|S )
a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s    | ]}t |tV  qd S r5   )rt   r   r   r;   r;   r<   r   f  s    

z*_split_optim_state_dict.<locals>.<genexpr>FTr   rh   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r   r   allr
   r'   r   r1   rv   r   rP   rQ   r(   r]   rM   rt   r   rx   idr   r}   r   )r`   r   r   r   r3   r   r   Z
pg_mappingr   r   rk   r   Zloaded_param_groupr2   r   Zpg_idxr   r   r;   r;   r<   _split_optim_state_dictL  s   





r   c              	      s~  |j sd S |D ]4}t| |r*t|v rt| |||}nt|ttttf ||}ni }|j	r| 
 D ]d\}}t| |}t| |dd}	||	krHq3t|dksPJ | |	 |t D ]}
ttttf |
}fdd|t D }||t< q\tt|t }t| D ]}|v r||||< qq3|  t| ||}W d    n1 sw   Y  n|jr4d|_t| |f|}d|_d   fdd}ttj||} d usJ t|\}}t|\}}|jrt|| d	 nt || d	 | D ]}||vr||v s
J || ||< || ||< qt!||}|t D ]}t|vr2g ttttf |t< q t"|d
|d qd S )NF)rc   rh   c                    s   g | ]}|  qS r;   r   )rj   r   )rk   fqn_with_compilerr;   r<   r     s    z*_load_optim_state_dict.<locals>.<listcomp>Tc                    s4   |   dkr d u r| j | S  | jkrtd| S )Nr   zDevice mismatch)r   r   r   )tr   r;   r<   _device  s   
z'_load_optim_state_dict.<locals>._devicer   r   r   )#rT   r   r   r   r   r
   rW   rM   r&   rV   r   r   rx   r   r   r   r   r'   r]   r   rq   rU   rw   Zoptim_state_dict_to_loadr>   r   r#   rX   rY   r   rC   r   r   r   r   )r`   r   r   r   r   r   Zoriginal_fqn_r   Zfqns_with_compilerr   valr2   Z	osd_stater   r   r   Zflatten_osdZosd_mappingZflatten_local_osdZlocal_osd_mappingZ	optim_keyZpgr;   )r   rk   r   r<   _load_optim_state_dict  s   



	

r  c                C   sV   t   t| dd||d}t| |}t|i | |W  d   S 1 s$w   Y  dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    r;   Fr   r   r   N)r=   r   r   r   )r`   r   r   r   r   r;   r;   r<   r+     s   
$r+   c                C   st   t  - t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  d   S 1 s3w   Y  dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tr  N)	r=   rt   rX   r   	Optimizertupler   r   r   )r`   r   r   r   r   r   r;   r;   r<   r,   0  s    $r,   c                C   s   t  4 t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  d   S 1 s:w   Y  dS )a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
        ...     fsdp_model, fsdp_optim
        ... )

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fr  N)
r=   rt   rX   r   r  r  r   r   r   r   )r`   r   r   r   r   r   r   r;   r;   r<   r-   ]  s"   H
$r-   c           	         s   |si S t tt| tjretdt t	t
tjt
ttf f |}i }| D ]8\}}|  D ]/\}}||kr;q2t| |}t|dksJJ dtt| d | fdd| D  q2q*|S t	t
ttf |S )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rh   z/FQNs for a submodule should only have 1 elementrf   c                    s   i | ]	\}} | |qS r;   r;   )rj   Zsubfqnr   rl   r;   r<   
<dictcomp>  s    z/_unflatten_model_state_dict.<locals>.<dictcomp>)rt   r   r   r   r^   r_   r   r   r   r
   rW   rM   r&   r   r   r   rx   r   )	r`   r   Zcast_state_dictr   r   Zsub_state_dictra   mr   r;   rl   r<   _unflatten_model_state_dict  s,   

r  )r   c                C   s\   t | |}t  t| dd|d}t|i | t| ||W  d   S 1 s'w   Y  dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    r;   Fr   r   N)r  r=   r   r   r   )r`   r   r   r   r;   r;   r<   r.     s   
$r.   c                C   sr   t  , t|tjjr|fnt|}t| |d|d}ti || t| ||| W d   dS 1 s2w   Y  dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
        ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
        initialized correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr  N)	r=   rt   rX   r   r  r  r   r   r  )r`   r   r   r   r   r;   r;   r<   r/      s   "r/   c                C   s   t | |}t 2 t|tjjr|fnt|}t| || |d}t||| t	| ||| t
| ||W  d   S 1 s=w   Y  dS )a  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
        is called on the optimizers. Otherwise, the optimizer states won't be initialized
        correctly.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r  N)r  r=   rt   rX   r   r  r  r   r   r  r   )r`   r   r   r   r   r   r;   r;   r<   r0   +  s   .

$r0   c                   sj   t jt| |dfdd}|| _t jt| |d dtttf f fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r   c                           S r5   r;   r;   _state_dict_callr;   r<   state_dict_call     z0_patch_model_state_dict.<locals>.state_dict_callr   c                        | d d S )N)r   r;   r   _load_state_dict_callr;   r<   load_state_dict_call     z5_patch_model_state_dict.<locals>.load_state_dict_callN)r   r   r+   r   r.   rW   rM   r   r   r4   r   )r`   r   r  r  r;   r  r  r<   _patch_model_state_dictm  s    
r  c                   s   t jt| ||dfdd}t jt| ||d dtttf f fdd}t| t| t	|t
jjr9|fnt|}|D ]}||_||_q?dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )r`   r   r   c                      r	  r5   r;   r;   r
  r;   r<   r    r  z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    r  )N)r   r;   r   r  r;   r<   r    r  z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r   r   r,   r/   rW   rM   r   r4   r   rt   rX   r   r  r  r   r   )r`   r   r   r  r  r   r;   r  r<   _patch_optimizer_state_dict  s0   

r  )rE   TT)rE   )rr[   r   r6   r   collections.abcr   r   dataclassesr   r   r   	itertoolsr   typingr   r	   r
   r   r   r   rX   Ztorch.distributedr   r   Ztorch.nnr^   Z'torch.distributed._shard.sharded_tensorr   Z#torch.distributed._state_dict_utilsr   r   r   r   r   r   Z;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   Ztorch.distributed.fsdpr   r   r   rw   r   r   r   r   r   Z$torch.distributed.fsdp._common_utilsr   r   Ztorch.distributed.tensorr    Ztorch.nn.modules.moduler!   Ztorch.nn.parallelr"   ru   Ztorch.utils._pytreer#   __all__ry   r   r   r   rZ   rM   r$   rY   r   r   r%   r]   r  rW   r&   r'   r(   r)   r4   rL   r   r=   r*   rN   cacher_   rK   r   r   r   r   r  r   r   r   r   Zno_gradr   r   r   r   r   r   r   r  r+   r,   r-   r  r.   r/   r0   r  r  r;   r;   r;   r<   <module>   s  
  (


/G
-
 

$-



C
E*@

??
^c

)
1
[$

%

,
1

B6