
    [Th                        % S SK r S SKrS SKrS SKrS SKJrJr  S SKJrJ	r	J
r
  S SKJr  S SKJrJrJrJrJrJr  S SKrS SKJr  S SKJr  S SKJr  S SKJrJrJrJ r J!r!J"r"  S SK#J$r$  S S	K%J&r&J'r'J(r)J*r*J+r+J,r,J-r-J.r.  S S
K/J0r0J1r1  S SK2J3r3  S SK4J5r5  S SK6J7r8  S SK9J:r:  / SQr;Sr<Sr=Sr>Sr?\@\A   rB\\3\\R                  \D\E\A4   rF\\F\G\F   \H\F   \I\AS4   4   rJ\I\A\J4   rK\G\K   rL\I\A\\K\L4   4   rM\@" 5       rN\@\   \OS'   \ R                  S 5       rQ\	 " S S5      5       rR\	 " S S\R5      5       rS\R                     SIS\R                  S\AS\AS\VS\VS \B4S! jj5       rW " S" S#5      rXSJS$ jrYSSS%.S\R                  S&\H\R                  R                  S'4   S(\VS)\\@\R                        S*\\R   S \S4S+ jjr\S,\I\A\J4   S-\MS.\SS S4S/ jr]S0\\R                  \R                  R                  4   S1\AS \4S2 jr^S3\I\A\4   S.\SS \I\A\4   4S4 jr_\R                  " 5       S\R                  S.\SS \I\A\J4   4S5 j5       ra\R                  " 5       S\R                  S3\I\A\J4   S.\SS \54S6 j5       rbS7\R                  R                  S S4S8 jrcS3\MS \I\A\J4   4S9 jrdS7\R                  R                  S3\I\A\J4   S.\SS \M4S: jre\R                  " 5       S\R                  S;\H\R                  R                  S'4   S.\SS \M4S< j5       rfS\R                  S7\R                  R                  S-\MS.\SS \M4
S= jrg\R                  " 5       S\R                  S;\H\R                  R                  S'4   S3\MS.\SS S4
S> j5       rhSSS%.S\R                  S)\\@\R                        S*\\R   S \I\A\J4   4S? jjriSSS%.S\R                  S;\\R                  R                  \\R                  R                     4   S)\\@\R                        S*\\R   S \M4
S@ jjrjSSS%.S\R                  S;\\R                  R                  \\R                  R                     4   S)\\@\R                        S*\\R   S \H\I\A\J4   \M4   4
SA jjrkS\R                  S3\\I\R                  \I\A\J4   4   \I\A\J4   4   S \I\A\J4   4SB jrlSSC.S\R                  S,\I\A\J4   S*\\R   S \54SD jjrmSSC.S\R                  S;\\R                  R                  \\R                  R                     4   S-\MS*\\R   S S4
SE jjrnSSC.S\R                  S;\\R                  R                  \\R                  R                     4   S,\I\A\J4   S-\MS*\\R   S \54SF jjro\SSC.S\R                  S*\\R   S S4SG jj5       rp\SSC.S\R                  S;\H\R                  R                  S'4   S*\\R   S S4SH jj5       rqg)K    N)	GeneratorIterable)asdict	dataclassfield)chain)AnyCallablecastno_type_checkOptionalUnion)ShardedTensor)_broadcast_state_dict_distribute_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)DTensor)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater'   _patched_state_dictc               #      #    [         R                  " 5       n [         R                  " 5          S v   U (       a  [         R                  " 5         g g ! U (       a  [         R                  " 5         f f = f7fN)gc	isenableddisableenable)
is_enableds    _/var/www/auris/envauris/lib/python3.13/site-packages/torch/distributed/checkpoint/state_dict.py_gc_contextr?   Q   sC     JJJLIIK :IIK s   +A2A A2A//A2c                       \ rS rSr% SrSr\\S'   Sr\\S'   Sr	\\S'   Sr
\\S'   Sr\\S	'   Sr\\S
'   Sr\\S'   Sr\\S'   Srg)r+   \   a   
This dataclass specifies how get_state_dict/set_state_dict will work.

- ``full_state_dict``: if this is set to True, all the tensors in the
  returned state_dict will be gathered. No ShardedTensor and DTensor
  will be in the returned state_dict.

- ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
  ``full_state_dict`` is also true, then only the rank0 will get the
  state_dict and all other ranks will get empty state_dict.

- ``ignore_frozen_params``: if the value is True, the returned state_dict
  won't contain any frozen parameters -- the ``requires_grad`` is False.
  The default value is False.

- ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
  indicates whether to keep the submodule prefixes from the state_dict keys.
  or example, if the submodule is ``module.pretrain`` and the full FQN of
  the parameter is ``pretrain.layer1.weight`` of the param. When this option
  is True, the parameter's key in the returned state_dict will be
  ``pretrain.layer1.weight``. If the options is False, the key will be
  ``layer1.weight``.
  Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
  FQNs, hence there should be only one submodule in ``submodules``.

- ``strict``: the ``strict`` option when ``set_state_dict`` calls
  model.load_state_dict().

- ``broadcast_from_rank0``: when the option is True, rank0 should receive a
   full state_dict and will broadcast the tensors in the state_dict/
   optim_state_dict one by one to other ranks. Other ranks will receive
   the tensors and shard according to the local shards in the model and
   optimizer. ``full_state_dict`` must be set to True when using this option.
   This option currently only supports DTensor, not the legacy ShardedTensor.
Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dict_fqn_modifiersdsd_fqn_modifiers N)__name__
__module____qualname____firstlineno____doc__rB   bool__annotations__rC   rD   rE   rF   rG   rH   rJ   str__static_attributes__rK       r>   r+   r+   \   s_    "H "OT!K!&$&$(T(FD!&$&). $.-s-rU   r+   c                   X   \ rS rSr% \" \S9r\\\\	R                  4   \\\	R                  4   4   \S'   \" \S9r\\\\	R                  4   \\\	R                  4   4   \S'   \" \S9r\\   \S'   Sr\\S'   Sr\\S'   \R(                  r\\S	'   \" \S9r\\R4                     \S
'   Srg)_StateDictInfo   )default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesrK   N)rL   rM   rN   rO   r   dictrZ   r   rS   torchTensorr%   rR   r[   setr\   r]   rQ   r^   
contextlibnullcontextr_   r
   listr`   nnModulerT   rK   rU   r>   rW   rW      s    
 	d# tc5<< fell"#	% $ 	d# 4c5<< fell"#	% $ $)#=C=L$L$'33L(3$)$$?L$ryy/?rU   rW   modelnamerJ   skip_ddp_prefixskip_compiler_prefixreturnc                 ,   UR                  [        S5      nSU;  a  U1$ UR                  S5      n/ nU n[        U5       GH&  u  p[	        U[
        5      (       a0  U	S:X  d   eUR                  nU(       d  UR                  U	5        MI  MK  [	        U[        5      (       a  U[        U5      S-
  :  a^  XXS-      [        :X  aO  SR                  U5      n
[        U[        5      nU
(       a  U
 S3n
UR                   Vs1 s H  o U 3iM
     sns  $ [        U[        5      nU	[        :w  a  UR                  U	5        [        Xy5      nGM	  GM  [	        U[        R                   R"                  R$                  5      (       a2  U	S:X  d   eUR&                  nU(       d  UR                  U	5        GMn  GMq  [)        Xr5      (       aA  [        Xr5      " 5       R+                  U	5      =n(       a  [)        X}5      (       a  [        X}5      nUR                  U	5        U	[,        R.                  R                  R0                  :X  a   U[        U5      S-
  :w  a  [3        S5      eGM  [        Xy5      nGM)     SR                  U5      R                  [        S5      1$ s  snf )a  
This API is used to convert the name of a parameter to the FQNs. For FSDP
without `use_orig_params`, the name of FlatParameter can be mapped to
multiple original parameters. As a result, the return type of this function
is `set[str]`.

Args:
    module (nn.Module): the root model.
    name (str): the name
    skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

Returns:
    The canonical FQNs based on the model traversal.
 .module   	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrr   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr    rb   _dynamo
eval_frameOptimizedModulert   hasattrgetrh   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)rj   rk   rJ   rl   rm   	obj_namesfqn_obj_namescurr_objicurr_obj_nameprefix
flat_paramfqnremoved_fqns                 r>   	_get_fqnsr      sF   0 <<*B/D
$v

3IMH%i0h$$ H,,,H"$$]3 #$''3y>A%%)E*:k*I-0$X{;
 &xq\F4>4D4DE4DS(3%(4DEEx)<=H 33$$]3"8; 4 %--":":"J"JKK K///))H'$$]3 (
 x33")("F"H"L"L!# ;  x55#*8#A  /

 1 1 I III**&'VWW + #8;I 1L HH]#++,>CDD5 Fs   <Jc                       \ rS rSrSrg)_EXTRA_STATE   rK   N)rL   rM   rN   rO   rT   rK   rU   r>   r   r      s    rU   r   c              #      ^^^#    [        5       mS[        R                  S[        S[        4UUU4S jjmT" U S5       S h  vN   g  N7f)Nrr   curr_fqnrn   c              3     >#    TR                  U 5        U(       a  U S3OSnU R                  5        H]  u  p#UT;   a  M  [        U T5      (       a)  U[        U T5      " 5       R	                  5       ;   a  US S nOU U 3nT" X45       S h  vN   M_     [        U R                  SS9U R                  SS95       H!  u  p%X R                  ;   a  M  U U 3nXE4v   M#     [        U R                  S[        R                  R                  5      [        R                  R                  :w  a7  U [        R                  R                  R                   3nU[!        5       4v   g g  N7f)Nrq   rp   F)recurseget_extra_state)addnamed_childrenr   r   valuesr   named_buffersnamed_parameters_non_persistent_buffers_set	__class__rh   ri   r   r   rr   r   r   )	rr   r   rk   	submodulenew_fqnobjrJ   r   visited_moduless	         r>   r   +_iterate_valid_model_state.<locals>.recurse   s]    F#%-hZq>2%446ODO+  122GF,=>@GGII #3B-%Jtf-y222  7    /1H1HQV1H1W
ID 999!
4&)G,
 F$$&79R9RSyy(() "
2::#4#4#L#L"MNG<>))	) 3s   BE$E"CE$rp   )rd   rh   ri   rS   r   )rj   rJ   r   r   s    `@@r>   _iterate_valid_model_stater      sC     &)eO *		  *S  *Y  *  *D ub!!!s   =A
AA
)
submodulesoptionsoptims.
optim_onlyr   r   c                z   U(       a  [         R                  " S[        5        U(       a  U(       d  [        S5      eU=(       d
    [	        5       n0 n0 n[        U 5       H  u  px[        U[        5      (       a  M  [        X5      n	UR                  US5      n
U
b/  [        [        [           XX   5      R                  U	5        XX   Xh'   OU	R                  5       XX'   U	 H  n
[        U[        5      (       a  M  XU
'   M      M     [        UR!                  5       5       H*  u  pU H  n
[        ["        R$                  U5      Xj'   M!     M,     [        5       nU(       ad  [        U5      nU R'                  5        HE  u  p~X;  a  M  [        X5      n	[)        U	5      S:X  d   S5       eUR                  S U	 5       5        MG     UR*                  (       a  UR,                  (       d  [/        S5      e[0        R2                  " U 5      nU(       a  UR,                  (       a`  [5        UR6                  UR6                  S9n[9        UR6                  UR6                  =(       d    UR*                  S9n[:        R<                  nO6[?        UR6                  S	9n[A        UR6                  S	9n[:        RB                  n[D        RF                  S
 5       n[H        RJ                  " UU UUUS9nO[D        RL                  n[O        S0 [Q        U5      DUUUU[        [        [R        RT                     U5      U(       + [)        U5      S:  S.D6$ )zO
Verify the model and options passed by the user and generates _StateDictInfo.
zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrs   z)Submodule FQN should only have 1 instancec              3   *   #    U  H	  o S 3v   M     g7f)rq   NrK   ).0r   s     r>   	<genexpr>"_verify_options.<locals>.<genexpr>L  s     %@4CQi4s   z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              3     #    [         R                  " 5          [         R                  " SS[        S9  [        R
                  " U UUUS9   S v   S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f7f)NignorezFSDP.state_dict_type)messagecategoryrr   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsfilterwarningsFutureWarningr{   r   r   s       r>   $fsdp_state_dict_type_without_warning=_verify_options.<locals>.fsdp_state_dict_type_without_warningj  sj      ((*''&<} ))!$3&7,C	 	 +* 	 +*s4   B2A1
A A1	B 
A.	*A11
A?;Br   r   )rZ   r[   r\   r_   r`   r]   r^   rK   )+r   warnr   r   r+   r   rx   r   r   r   r   rd   rS   updatecopyrg   itemsrb   rc   named_modulesr|   rG   rB   
ValueErrorr{   r`   r   rC   r   r   FULL_STATE_DICTr   r   SHARDED_STATE_DICTre   contextmanager	functoolspartialrf   rW   r   rh   ri   )rj   r   r   r   r   rZ   r[   rk   paramfqnsr   param_fqns_r\   rr   r`   r   r   r   r   r_   s                        r>   _verify_optionsr     s    I 		
 &I
 	
 +)+G 	 
 	  2%8e\**%##E40?S,34;;DA+<+C!( (,yy{$Ce\22).#&  9  399;<C)-ellF)C!&  = $'5_
!//1LD'U)Dt9>N#NN>%%%@4%@@ 2 ##G,C,CM
 	
 $$U+L "" 3&22w?R?R! '?&22#//O73O3O'# ,;;O 6&22! 'B&22'# ,>>O		"	"	 
#	$ !((0+/$;
 "-- 	
/	+3-!$ryy/<8#^&kAo	 	rU   model_state_dictoptim_state_dictinfoc                    UR                    H  n[        U5      nUb  M   S5       e   UR                  (       a  U (       d  UR                  (       dx  UR                  (       dg  UR
                  (       a  UR                  (       dE  UR                  (       a4  UR                  (       d#  [        S[        R                  " 5       < S35      eUR                  (       aH  U(       dA  UR
                  (       a  UR                  (       d  UR                  (       d  [        SU 35      eU R                  5        H"  n[        U;   d  M  [        U S[         S35      e   g )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=rq   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)r`   r   r]   r\   rD   rC   rB   rF   rG   r   distget_rankr^   keysr}   )r   r   r   rr   
fsdp_statekeys         r>   _verify_state_dictr     s   
 ##CFK
%R'RR% $ 	 ''))!!d&:&:KK))'mmo'q*
 	
  %%$*>*>..::J9KM 
  $$&#%z+ /* *  'rU   r   apic                     [        X5      nU[        ;   a)  [        R                  " [        U R                  U5      U S9nU$ )N)self)r   r6   r   r   r   )r   r   calls      r>   _state_dict_fnr     s7    3D""  !<3GKrU   
state_dictc                     UR                   (       aL  UR                  (       a#  [        R                  R	                  5       (       d  SOSn[        XR                  US9$ UR                  (       a  [        U 5      $ U $ )NrK   )r   )rC   
ranks_only)rB   rC   rb   distributedis_initializedr   r   )r   r   r   s      r>   _maybe_full_or_cpu_state_dictr     so      $$E,=,=,L,L,N,N  	
 "$4$4
 	
 
		)*55rU   c                 z   UR                   (       d  0 $ UR                  5          [        U S5      " 5       nS S S 5        [        WR	                  5       5       H|  n[        X5      n[        U5      S:X  d   X445       e[        [        U5      5      nXS:w  d  M@  S[        4S jnU" X55      (       d  [        SU SU 35      eUR                  U5      X%'   M~     UR                  (       as  0 nUR	                  5        H[  nUR                   HH  nUR                  U5      (       d  M  UR                  (       a  X%   Xu'   M4  U[        U5      S  n	X%   Xy'   MJ     M]     UnUR                  (       aQ  U R!                  5        H=  u  p:U
R"                  (       a  M  [        X5      nU H  nUR                  U5        M     M?     [        UR%                  5       5       HF  u  p;[&        R(                  " U5      (       d  M"  UR*                  (       d  M5  UR                  U5        MH     [-        X!5      $ ! , (       d  f       GN= f)Nr   rs   rn   c                    [        U5      [        U 5      :  a  gUR                  S5      nU R                  S5      nSn[        U5       H>  u  pVXbU   :X  a)  US-  nU[        U5      :X  a  U[        U5      S-
  :H  s  $ M6  US;   a  M>    g   g)NFrq   r   rs   )rr   rt   T)r|   rv   rw   )r   r   	fqn_split	key_splitfqn_idxkey_idxkey_names          r>   verify%_get_model_state_dict.<locals>.verify  s    s8s3x' IIcN	IIcN	)29)=%GW#551"c)n4#*c)nq.@#@@ 5!%<< $ *> rU   zAn unexpected key, z, exists. FQN is )r]   r_   r   rg   r   r   r|   nextiterrQ   r   popr\   
startswithrE   rD   r   requires_gradr   rb   	is_tensoris_metar   )rj   r   r   r   r   r   r   new_state_dictr   r   r   ps               r>   _get_model_state_dictr     s    					#E<8:
 
 JOO%&$4yA~*{*~4::D " ###"%8=Nse#TUU(nnS1JO7 ': /1??$C11~~f--//*4/N'!#f+-0G.8oN+ 2 % $
  002JC""U(Ds# 	 3 z'')*??1!)))NN3 + )::u 
	s   H++
H:c           
         UR                   (       a  U(       d  UR                  (       d  [        0 0 5      $ 0 n[        XR                  5       H  u  pE[        XUR                  5      n[        U UUR                  SSS9n[        Xg5       Hr  u  pUR                  (       a  [        R                  " 5       S:X  a?  X:w  a:  UR                  US 5      n
U
c!  UR                  (       a  [        SU S35      eOXU	'   XSU	'   Mt     M     SnUR                  (       d  UR                  (       Ga  [        5       nUR                  5        HS  u  pE[        R                   " U5      (       d  M"  UR#                  5       S:  d  M8  UR%                  UR&                  5        MU     [        R&                  " S5      U;   a'  UR)                  [        R&                  " S5      5        Sn[+        U5      S:X  a.  UR%                  [        R,                  R/                  5       5        O[+        U5      S:  a  [1        S	5      eUR                  (       a0  [3        UUUR                  5       UR                  UR4                  S
9  O)UR                  (       a  [7        XUR                  5       S9  UR                  5        H	  u  pXU'   M     UR9                  5          [;        [        [=        U S5      " XR                  US95      sS S S 5        $ ! , (       d  f       g = f)NF)rl   rm   r   zMissing key: rq   metaTrs   zMultiple devices found)devicerF   rC   r   load_state_dict)r   rF   assign)r]   rG   r"   r   rJ   r   zipr   r   r   rF   r   rB   rd   r   rb   r   dimr   r   remover|   distributed_c10d_get_pg_default_devicer   r   rC   r   r_   r   r   )rj   r   r   local_state_dictr   valuer   fqns_with_prefixr   fqn_with_prefix
load_valuer   deviceslocal_states                 r>   _load_model_state_dictr    sa    Z8Q8Q R((08N8NO
T%;%;<$""!!&
 %($? C--A1E('^^C6
%{{*]3%q+ABB # 3=/05_- %@ P, F  D$8$8$8%*002JCu%%%))+/ELL) 3
 <<7*NN5<</0Fw<1KK--DDFG\A566$$! {{}{{ ,, !!":V 0 6 6 8C)sO !9 
			5"34%kk&
 
		s   	)K<<
L
optimc                    U R                   (       a  gU R                   H#  nU[            H  nUR                  c  M      g   M%     U R                   HA  nU[            H1  nUR                  (       d  M  [
        R                  " U5      Ul        M3     MC     / nU R                   H\  nSU;   d  M  UR                  US   5        [        US   [
        R                  5      (       a  [
        R                  " S5      OSUS'   M^     U R                  SS9  U R                   H  nSU;   d  M  UR                  S5      US'   M!     U R                  SS9  g)z@
Initialize optim states by calling the step() with zero grads.
Nlrg        )closurer   T)set_to_none)r5   r3   _PARAMSgradr   rb   
zeros_likerz   rx   rc   tensorstepr   	zero_grad)r  param_groupr   lrss       r>   _init_optim_stater  c  s*    {{ )) )Ezz% * *
 )) )E""""--e4
 * * C));JJ{4() k$/>> S!  * 
JJtJ )); #
K * 
OOO%rU   c           
         S n0 n[        [        U [           5      R                  5        HD  u  p4[        [        U5      R                  5        H  u  pVU" U5        Xb[         SU SU 3'   M     MF     [        [        U [
           5       H_  nUR                  [        5      n[        [        [           U5       H,  nUR                  5        H  u  pVXb[
         SU SU 3'   M     M.     Ma     U$ )a  
This API flattens the optimizer state_dict to support optimizer resharding for
MPMD, e.g., pipeline parallelism.

Without the API, the original optimizer state_dict looks like:
{
    "state": {
        "layer1.weight": {
            "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
        },
        "layer2.weight": {
            "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
        },
    },
    "param_group": [
        {
            "lr": 0.0,
            "betas": (0.9, 0.95), ...,
            "params": ["layer1.weight", "layer2.weight"]
        }
    ]
}

With this API, the optimizer state_dict looks like:
{
    "state.layer1.weight.step": 10,
    "state.layer2.weight.step": 10,
    "state.layer1.weight.exp_avg": SomeTensor,
    "state.layer2.weight.exp_avg": SomeTensor,
    "state.layer1.weight.exp_avg_sq": SomeTensor,
    "state.layer2.weight.exp_avg_sq": SomeTensor,
    "param_group.layer1.weight.lr" : 0.1,
    "param_group.layer2.weight.lr" : 0.1,
    "param_group.layer1.weight.betas" : (0.9, 0.95),
    "param_group.layer2.weight.betas" : (0.9, 0.95),
}

Note that if any of the value is a container, like the betas in the example,
this API won't flattent it.
c                     [        U [        R                  [        [        45      (       d  [        S[        U 5       S35      eg )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is rq   )rx   rb   rc   intfloatNotImplementedErrortype)vs    r>   _raise_if_type_not_supported?_flatten_optim_state_dict.<locals>._raise_if_type_not_supported  sA    !ellC788%7)1&  9rU   rq   )
r   r(   _STATEr   r)   _PGr   r  rg   rS   )	r   r"  retr   r5   kr!  r  r   s	            r>   _flatten_optim_state_dictr(    s    T !#C=*V*<=CCE
.446DA(+)*6(!C5!%& 7 F
 -z#?w'S	4(C#))+*+se1SE1#&' , ) @
 JrU   c                    0 n/ n[         U[        U0nU R                   GH  nUR                  [        / 05        U[            GH  nUR
                  U    H  nXR                  ;   a;  Sn	UR                  5        H$  n
U
[        :X  a  M  [         SU SU
 3nX;   a  Sn	  O   OSn	U	(       d  MX  US   [           n[        U[        5      (       d   eUR                  U5        UR                  (       d  M  0 X8'   U R                  U   R                  5        H'  nU[          SU SU 3   [        [        X8   5      U'   M)     M     GM     [        [        [           US   [           5      S   nUR                  5        H[  n
U
[        :X  a  M  U[         SU SU
 3   nXS   ;  a	  XS   U
'   M1  US   U
   U:w  d  M?  [        SU SU
 SU SUS   U
    S3	5      e   GM     U$ )	z
This API unflattens the state_dict generated by _flatten_optim_state_dict().
See the docstring of _flatten_optim_state_dict() for more detail.
Frq   Tr   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r$  r%  r3   rz   r  rZ   r[   r   rx   rg   r   r5   r   r(   rS   r   )r  r   r   r5   pg_state
return_osdr  r   r   	in_paramsr'  flatten_keyr4   
state_namefirst_param_fqnr  s                   r>   _unflatten_optim_state_dictr0    s    E"$H&,eS(%CJ))"& )E--e4 444 %I(--/<$),Qse1QC&8&4(,I 0 !%I !"g.!&$////c"**
"'++e"4"9"9";JBL!(!C5*6CD
3J? #<3 5 *> tCy(2,w*?@C!!#AG|#a'8!=>E$"'Q"aE)"==L<MQqc R 3HRLO3DAG  $E *^ rU   
optimizersc                 @   UR                   (       d  0 $ [        0 [        / 0nU GH  n[        U5        [	        US5      " 5       nUR
                  (       a  UR                  5          [        R                  " XU5      nS S S 5        U(       d  Mj  [        U[           R                  5       5       H=  nSU;   d  M  U[           R                  U5      U[           UR                  SS5      '   M?     U[            H3  nU[            Vs/ s H  ofR                  SS5      PM     nnX[        '   M5     GO5[        [        R                  " S UR                    5       5      5      n[#        [%        U['        [)        U5      5      5      5      n	0 n
U R+                  5        HH  u  p[-        X5      n[)        U5      S:X  d   e[/        [1        U5      5      nX;  a  M<  X   nXU'   XU'   MJ     [        U[           R                  5       5       H)  nX   nU[           R                  U5      U[           U'   M+     U[            H&  nU[            Vs/ s H  oU   PM	     snU[        '   M(     U(       d  GML  [3        [4        U[           5      R7                  U[           5        [3        [8        U[           5      R;                  U[           5        GM     UR<                  (       a  [3        [>        [A        U5      5      n[C        X25      $ ! , (       d  f       GN= fs  snf s  snf )Nr   rt   z
_orig_mod.rp   c              3   2   #    U  H  o[            v   M     g 7fr8   )r  )r   gs     r>   r   (_get_optim_state_dict.<locals>.<genexpr>*  s     -UBTQjBTs   rs   )"r^   r$  r%  r  r   r`   r_   r{   r   rg   r   r   ru   r  r   from_iterabler3   ra   r   ranger|   r   r   r   r   r   r(   r   r)   extendrH   r*   r(  r   )rj   r1  r   r   r  osdr'  r4  r4   param_pid_mappingfqn_pid_mappingr   r   r   r   pidgroups                    r>   _get_optim_state_dictr>    s    	,2BR+@% UL13""$++E#> % #f+**,-!#?B6{q?QCK		, ;< . X?@zJz!))L"5zJ#'
  %---U%BTBT-UUVF $Ss6{1C%D E O#446
 ,4yA~%~4:&1'.'*$'*$ 7 CK,,./%*#&v;??3#7FC  0 SBG.!Q.3#"6.!Qg " ],V45<<S[I 0 56==c#hGY \ (( 9:J K
 ))9@@_ %$ K* "Rs   %LL
L
L	c           
      D   0 n/ n[         U[        U0n0 n[        S [        [        U[            5      R                  5        5       5      (       a  U$ UR                   GHP  nUR                  [        / 05        U[            GHA  n	UR                  U	    GH)  n
XR                  ;   aG  Sn[        [        U[           5       H)  nU
[        [        [           U[           5      ;   d  M'  Sn  O   OSnU(       d  Me  US   [           n[        U[        5      (       d   eUR                  U
5        U	R                  (       a  [        [        U[            5      U
   XJ'   [        [        U[           5       HH  nU
[        [        [           U[           5      ;   d  M'  [!        U[           5      S-
  U[#        U5      '   MJ     GM,     GMD     [!        U[           5      S:X  d  GM  / n[        [        U[           5       HA  n[!        [        [        [           U[           5      5      S:X  d  M0  UR                  U5        MC     [!        U5      S:w  a  [%        S5      e[!        U[           5      [!        UR                  5      :w  a  [%        S5      e[!        U[           5      S-
  U[#        W5      '   GMS     [        [        U[           5       HS  nUR'                  [#        U5      S5      nUS:X  a  M&  UR)                  5        H  u  nnU[        :X  a  M  UX_   U'   M     MU     U$ )	a  
Extract the corresponding optim state_dict from ``optim_state_dict`` for
``optim`` and return the result optim state_dict.

Args:
    model (nn.Module): the root model.
    optim (torch.optim.Optimizer): the optimizer.
    optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
        contains the optim state_dict of ``optim``.
    info (_StateDictInfo): state dict information.

Returns:
    The optim state_dict of ``optim``.
c              3   B   #    U  H  n[        U[        5      v   M     g 7fr8   )rx   r  )r   r'  s     r>   r   *_split_optim_state_dict.<locals>.<genexpr>f  s      $Xq
1c$Xs   FTr   rs   r   zThere are param groups that have zero parameters. In such a case, DSD only support exactly one param group with zero parameters.But the loaded state_dict has zero or more than one param groups that have zero parameters.z`When there is a parameter group that has zero parameters, multiple optimizers are not supported.)r$  r%  allr   r(   r   r3   rz   r  rZ   r[   r)   rg   rS   rx   r   r|   idr   r   r   )rj   r  r   r   r5   r*  r+  
pg_mappingr  r   r   r,  loaded_param_groupr4   r&  pg_idxr   r  s                     r>   _split_optim_state_dictrG  L  s   * E"$H&,eS(%CJ!#J
 $(8H8P$Q$V$V$X    ))"& )E--e4444 %I.2)+;C+@/* $tCy2DW2M"NN(,I!/ !%I !"g.!&$////c"&&!%m5Ef5M!Ns!SEJ*.%'7'<+& d49.@.IJJ=@C=QTU=U
2&8#9:	+' 5 *4 {7#$)C&*+<>Ns>S&T"tDI'9''BCDIJJ12 'U 3x1} 1  #C()S1C1C-DD =  25Z_1E1IJr,-._ *b -/?/DE;4R<%++-JCg~$)HS!	 . F rU   c           
      @  ^ UR                   (       d  g U GH  n[        U5        U(       a@  [        U;   a  [        XX#5      nO+[	        U[        [        [        [        4   U5      U5      nO0 nUR                  (       GaX  U R                  5        GH  u  pg[        X5      n[        XSS9n	X:X  a  M"  [        U5      S:X  d   eUR                  5       n
U	R                  5       nU[            HO  n[        [        [        [        4   U5      nU[             Vs/ s H  oR#                  X5      PM     nnX[         '   MQ     [        [$        U[           5      n['        UR)                  5       5       H.  nU
U;   d  M  UR                  U5      UUR#                  X5      '   M0     GM     UR+                  5          [,        R.                  " XU5      nS S S 5        GOUR0                  (       Ga
  SUl        [3        X4U5      nSUl        S mU4S jn[5        [6        R8                  UU5      nTc   e[;        U5      u  nn[;        U5      u  nnUR<                  (       a  [?        UUTS9  O[A        UUTS9  UR)                  5        H#  nUU;  d  M  UU;   d   eUU   UU'   UU   UU'   M%     [C        UU5      nU[            H3  n[         U;  d  M  / [        [        [        [        4   U5      [         '   M5     [E        US5      " US9  GM     g s  snf ! , (       d  f       N)= f)	NF)rm   rs   Tc                    > U R                  5       S:  a,  Tc  U R                  mU $ TU R                  :w  a  [        S5      eU $ )Nr   zDevice mismatch)r  r   r   )tr   s    r>   _device'_load_optim_state_dict.<locals>._device  sD    557Q;~!"   188+():;;rU   r   r   )r   )#r^   r  r$  rG  r0  r   ra   rS   r'   r`   r   r   r|   r   r%  r	   r  ru   r(   rg   r   r_   r{   optim_state_dict_to_loadrB   r>  r$   rb   rc   r   rG   r   r   r   r   )rj   r1  r   r   r  r   original_fqn_r   fqns_with_compilerr   fqn_with_compilerr4  valr   r4   	osd_stater'  r  rK  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keypgr   s                             @r>   _load_optim_state_dictrZ    s    % ##:*$  $?4S)^ 4jA4$   " $)#9#9#; 5%.e&" -4yA~%~hhj$6$:$:$<!)#.AtCH~q1CGJ7|GSC;|   $*L / !0@0HI	inn./AaxGP}}UVGW	!))C"CD 0% $<, ""$#'#@#@"2$  %$ !!!#(D 4UHdK#'D F ellG5EFA%%%':;K'L$K3FGW3X00((%k3DVT&{4EfU
 )--/	$55$3333>y3I%i03>y3I%i0	 0
  5!#4  's+"$>@Dc9n-r27; , 	u/0<LMi : %$s   L

6L
L	c          	          [        5          [        U SSUUS9n[        X5      n[        U0 U5        UsSSS5        $ ! , (       d  f       g= f)a  
Return the model state_dict of ``model``.

See ``get_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    The state_dict for ``model``.

:rtype: typing.Dict[str, ValueType]
rK   Fr   r   r   N)r?   r   r   r   )rj   r   r   r   r   s        r>   r,   r,     sI    0 
!
 1=+R6 
s	   '<
A
c          	         [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        U USUUS9n[        XU5      n[        0 XT5        UsSSS5        $ ! , (       d  f       g= f)a  
Return the combined state_dict for optimizers.

See ``get_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    The state_dict for ``optimizers``.

:rtype: OptimizerStateType
Tr\  N)	r?   rx   rb   r  	Optimizertupler   r>  r   )rj   r1  r   r   r   r   s         r>   r-   r-   0  sx    6 
 *ekk&;&;<< Mz" 	
 !
 1DI2/6 
s   AA33
Bc          	          [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        U USUUS9n[        X5      n[        XU5      n[        XVU5        XV4sSSS5        $ ! , (       d  f       g= f)a(  
Return the model state_dict and optimizers state_dict.

``get_state_dict`` can process any module that is parallelized by PyTorch
FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
combination of these parallelisms. The main functions of ``get_state_dict``
are: 1.) returning a model and optimizer state_dict that can be resharded
with a different number of trainers and/or different parallelisms.
2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
these APIs.
3.) sanity checking the result state_dict.

The keys of the result state dictionary are the canonical FQNs (Fully
Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
position in an nn.Module hierarchy. More specifically, a canonical FQN to a
parameter is the FQN returned by ``module.named_parameters()`` or
``module.named_buffers()`` when the module is not distributed by any
parallelisms. Since the optimizer internally uses parameter IDs to represent
a parameter, there will be a conversion from the parameter IDs to the
canonical FQNs when calling this API.

``get_state_dict`` can also process a module that is not parallelized. In
such a case, ``get_state_dict`` only performs one function -- converting the
optimizer parameter IDs to the canonical FQNs.

Example:
    >>> # xdoctest: +SKIP
    >>> import torch
    >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    >>> from torch.nn.parallel import DistributedDataParallel as DDP
    >>> from torch.distributed.checkpoint.state_dict import get_state_dict

    >>> fsdp_model = FSDP(copy.deepcopy(model))
    >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
    >>> ddp_model = DDP(copy.deepcopy(model))
    >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


    >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
    >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(
    ...     fsdp_model, fsdp_optim
    ... )

    >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
    >>> # the asserts will fail.
    >>> assert ddp_state_dict == fsdp_state_dict
    >>> assert ddp_optim_state == fsdp_optim_state_dict


Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    submodules (deprecated): Optional[set[nn.Module]]: only return the model parameters
        that belong to the submodules.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be returned. See
        `StateDictOptions` for the details.

Returns:
    ``Tuple`` that contain model state_dict and optimizer state_dict.

:rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
Fr\  N)
r?   rx   rb   r  r^  r_  r   r   r>  r   )rj   r1  r   r   r   r   r   s          r>   r.   r.   ]  s    P 
 *ekk&;&;<< Mz" 	
 !
 1=0DI+tD1! 
s   A*A??
Bc           
         U(       d  0 $ [        [        [        UR                  5       5      5      [        R
                  5      (       a  [        R                  " S[        5        [        [        [        R
                  [        [        [        4   4   U5      n0 nUR                  5        H  u  pEU R                  5        H{  u  pgXt:w  a  M  [        X5      n[!        U5      S:X  d   S5       e[        [        U5      5       S3n	UR#                  UR                  5        V
Vs0 s H
  u  pX-   U_M     snn
5        M}     M     U$ [        [        [        [        4   U5      $ s  snn
f )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rs   z/FQNs for a submodule should only have 1 elementrq   )rx   r   r   r   rh   ri   r   r   r   r   ra   rS   r'   r   r   r   r|   r   )rj   r   cast_state_dictr   r   sub_state_dictrk   mr   r   subfqnr  s               r>   _unflatten_model_state_dictrf    s0    	$tJOO-./;;" 	
 tBIItCN/C$CDjQ/1)8)>)>)@%I ..0> -4yA~X'XX~ d,-Q/%%AOAUAUAWXAWV_e+AWX 1 *A Di(*55	 Ys   !E!)r   c                    [        X5      n[        5          [        U SSUS9n[        U0 U5        [	        XU5      sSSS5        $ ! , (       d  f       g= f)a  Load the model state_dict.

The counterpart of ``get_model_state_dict`` to set the state_dict to the
model. See ``set_state_dict`` for the detail usage.

Args:
    model (nn.Module): the nn.Module to the model.
    model_state_dict: (Dict[str, ValueType]):
       the model state_dict to load. If the key of the ``model_state_dict``
       is nn.Module, the key is a submodule of ``model`` and the value should
       be the state_dict of the submodule. When loading the state_dict,
       the prefix of the submodule will be append to the state_dict.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
        * **missing_keys** is a list of str containing the missing keys
        * **unexpected_keys** is a list of str containing the unexpected keys

:type model_state_dict: typing.Dict[str, ValueType]
rK   Fr   r   N)rf  r?   r   r   r  )rj   r   r   r   s       r>   r/   r/     sK    : .I. 
ubUGL+R6%etD	 
s   %A
Ac                    [        5          [        U[        R                  R                  5      (       a  U4O
[        U5      n[        XSUS9n[        0 X$5        [        XX$5        SSS5        g! , (       d  f       g= f)ad  Load the optimizers state_dict.

The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
optimizers. See ``set_state_dict`` for the detail usage.

WARN: ``set_optimizer_state_dict`` can only be called before ``backward()`` or after
    ``step()`` is called on the optimizers. Otherwise, the optimizer states won't be
    initialized correctly.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    optim_state_dict: OptimizerStateType:
        the optimizer state_dict to load.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    None

:type optim_state_dict: typing.OptimizerStateType
Trh  N)	r?   rx   rb   r  r^  r_  r   r   rZ  )rj   r1  r   r   r   s        r>   r0   r0      sf    > 
 *ekk&;&;<< Mz" 	
 uT7S2/6u2BI 
s   AA//
A=c                8   [        X5      n[        5          [        U[        R                  R
                  5      (       a  U4O
[        U5      n[        XU(       + US9n[        X#U5        [        XX55        [        XU5      sSSS5        $ ! , (       d  f       g= f)a  Load the model state_dict and optimizers state_dict.

The counterpart of ``get_state_dict`` to set the state_dict to the model and
optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
have to be returned by ``get_state_dict`` but must meet the following
requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
3) optimizer state_dict cannot contain the parameter IDs; the keys should be
the canonical FQNs.

WARN: ``set_state_dict`` can only be called before ``backward()`` or after ``step()``
    is called on the optimizers. Otherwise, the optimizer states won't be initialized
    correctly.

Args:
    model (nn.Module): the nn.Module to the model.
    optimizers (Union[Optimizer, Iterable[Optimizer]]):
        The optimizers that are used to optimize ``model``.
    model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
       the model state_dict to load. If the key of the ``model_state_dict``
       is nn.Module, the key is a submodule of ``model`` and the value should
       be the state_dict of the submodule. When loading the state_dict,
       the prefix of the submodule will be append to the state_dict.
    optim_state_dict: OptimizerStateType:
        the optimizer state_dict to load.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.

Returns:
    ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
        * **missing_keys** is a list of str containing the missing keys of the model state_dict.
        * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

:type model_state_dict: typing.Dict[str, ValueType]
:type optim_state_dict: typing.OptimizerStateType
rh  N)rf  r?   rx   rb   r  r^  r_  r   r   rZ  r  )rj   r1  r   r   r   r   s         r>   r1   r1   +  s    \ .I. 
 *ekk&;&;<< Mz" 	
 .>*>
 	+tDu2BI%etD 
s   A+B
Bc                  ^^ [         R                  " [        U US9mU4S jnX l        [         R                  " [        U US9mS[
        [        [        4   4U4S jjnX0l        [        R                  U5        [        R                  U5        g)a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
be a partial function to call ``get_state_dict`` and ``set_state_dict``.

Example:
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import patch_model_state_dict

    model = fsdp(model)
    patch_model_state_dict(model)

Args:
    model (nn.Module): the nn.Module to the model.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.
Returns:
    None
)rj   r   c                     > T " 5       $ r8   rK   _state_dict_calls   r>   state_dict_call0_patch_model_state_dict.<locals>.state_dict_call      !!rU   r   c                    > T" U S9  g )N)r   rK   r   _load_state_dict_calls    r>   load_state_dict_call5_patch_model_state_dict.<locals>.load_state_dict_call      z:rU   N)r   r   r,   r   r/   ra   rS   r	   r   r6   r   )rj   r   ro  ru  rt  rn  s       @@r>   _patch_model_state_dictrx  m  s    6 !((" '%--;c3h ; 1O,01rU   c                  ^^ [         R                  " [        U UUS9mU4S jn[         R                  " [        U UUS9mS[        [
        [        4   4U4S jjn[        R                  U5        [        R                  U5        [        U[        R                  R                  5      (       a  U4O
[        U5      nU H  nX5l        XEl        M     g)a`  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
be a partial function to call ``get_state_dict`` and ``set_state_dict``.

Note that if there are multiple optimizers, all of the optimizers will be patched.
So users only need to call one of the state_dict() to get the full result.

Example:
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.checkpoint.state_dict import patch_model_state_dict

    model = fsdp(model)
    patch_model_state_dict(model)

Args:
    model (nn.Module): the nn.Module to the model.
    options (StateDictOptions): the options to control how
        model state_dict and optimizer state_dict should be loaded. See
        `StateDictOptions` for the details.
Returns:
    None
)rj   r1  r   c                     > T " 5       $ r8   rK   rm  s   r>   ro  4_patch_optimizer_state_dict.<locals>.state_dict_call  rq  rU   r   c                    > T" U S9  g )N)r   rK   rs  s    r>   ru  9_patch_optimizer_state_dict.<locals>.load_state_dict_call  rw  rU   N)r   r   r-   r0   ra   rS   r	   r6   r   rx   rb   r  r^  r_  r   r   )rj   r1  r   ro  ru  r  rt  rn  s         @@r>   _patch_optimizer_state_dictr~    s    > !(( 	" &-- 	;c3h ; O,01 j%++"7"788 
: 
 * 4 rU   )rI   TT)rI   )rre   r   r9   r   collections.abcr   r   dataclassesr   r   r   	itertoolsr   typingr	   r
   r   r   r   r   rb   torch.distributedr   r   torch.nnrh   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   r   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   r{   r   r   r   r   r   $torch.distributed.fsdp._common_utilsr   r    torch.distributed.tensorr!   torch.nn.modules.moduler"   torch.nn.parallelr#   ry   torch.utils._pytreer$   __all__r}   r%  r  r$  rd   rS   r%   rc   r  r  r&   rg   r_  ra   r'   r(   r)   r*   r6   rR   r   r?   r+   rW   cacheri   rQ   r   r   r   r  r^  r   r   r   r   no_gradr   r  r  r(  r0  r>  rG  rZ  r,   r-   r.   rf  r/   r0   r1   rx  r~  rK   rU   r>   <module>r     s     	  / 0 0  F F     A 	 	 	 - 5 < -" 
		Sg}ellCKL4&m(<d3CS>TT	 S)^$' #u]4E%EFFG  &)U S] *   ,. ,. ,.^ @% @ @   . !%DE99DE
DE DE 	DE
 DE DE DEN	 	%"Z ,0*.99%++'',- 
 RYY( &' D*3	>**(* * 
	*Zbii)>)>>? c h S#X&4	#s(^$ @;99@;*@;	#y.@; @;F B
99B
S)^$B
 B
 	B
 B
J'&U[[22 '&t '&T=*< =c9nAU =@<;;  <S)^$< < 	<~ <A99<Aekk++S01<A <A 	<A <A~[99[;;  [ )[ 	[
 [| ]N99]Nekk++S01]N #]N 	]N
 
]N ]NF ,0*.	" 99"  RYY("  &'	" 
 
#y." R ,0*.* 99* ekk++Xekk6K6K-LLM*  RYY(	* 
 &'*  * b ,0*.X299X2ekk++Xekk6K6K-LLMX2 RYY(	X2
 &'X2 4Y!334X2v6996d299d3	>&::;T#y.=QQR6 
#y.6J +/	$E99$E3	>*$E &'	$E
 $EX +/(J99(Jekk++Xekk6K6K-LLM(J )(J
 &'(J 
(Jb +/=E99=Eekk++Xekk6K6K-LLM=E 3	>*	=E
 )=E &'=E =ED  +/129912 &'12 
	12 12l 
 +/	;599;5 ekk++S01;5 &'	;5
 
;5 ;5rU   