[Binary artifact: compiled CPython 3.10 bytecode (.pyc) of torch/distributed/fsdp/_init_utils.py, dumped as text; the marshalled code objects are not reproducible here. The recoverable metadata and embedded docstrings indicate that the module provides FSDP initialization helpers:

- Process-group / DeviceMesh setup, including hybrid sharding (_init_process_group_state, _init_process_group_state_for_hybrid_shard, _init_intra_and_inter_node_groups). For hybrid sharding, each node's devices form an intra-node group (e.g. [0..7] or [8..15] for two 8-device nodes), and ranks with the same local rank across nodes form inter-node groups (e.g. [0, 8], [1, 9], ...); a sketch of this layout follows below.
- Device handle, buffer, core, runtime, prefetching, extension, and state-dict state initialization (_init_device_handle, _init_buffer_state, _init_core_state, _init_runtime_state, _init_prefetching_state, _init_extension, _init_state_dict_state).
- Ignored module/parameter/buffer bookkeeping (_init_ignored_module_states, _get_ignored_modules, _get_ignored_params, _get_ignored_buffer_names).
- FlatParamHandle construction (_init_param_handle_from_module, _init_param_handle_from_params).
- Meta-device / torchdistX deferred-init materialization and module-to-device movement (_need_to_materialize_module, _materialize_meta_module, _move_module_to_device, _get_compute_device).
- Rank-0 parameter/buffer synchronization and default communication hooks (_sync_module_params_and_buffers, _get_default_comm_hook).]
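The following is a minimal sketch, not the FSDP implementation itself, of the hybrid-shard rank layout described by the recovered docstrings. It assumes world_size ranks evenly split over nodes of num_devices_per_node ranks each; the helper name and its exact construction details are illustrative only, though dist.new_subgroups and dist.new_group are real torch.distributed APIs that the module's symbol table references.

import torch.distributed as dist

def init_intra_and_inter_node_groups(num_devices_per_node: int):
    # Assumes the default process group is already initialized and that
    # world_size is a multiple of num_devices_per_node.
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    num_nodes = world_size // num_devices_per_node
    local_rank = rank % num_devices_per_node

    # Intra-node group: all ranks on this process's node, e.g. [0..7] or [8..15].
    # new_subgroups() creates one subgroup per node and returns this rank's subgroup.
    intra_node_group, _ = dist.new_subgroups(group_size=num_devices_per_node)

    # Inter-node group: ranks sharing the same local rank across nodes,
    # e.g. [0, 8], [1, 9], ... Every group must be created on every rank,
    # but each rank keeps only the group it belongs to.
    inter_node_group = None
    for lr in range(num_devices_per_node):
        ranks = [node * num_devices_per_node + lr for node in range(num_nodes)]
        grp = dist.new_group(ranks=ranks)
        if lr == local_rank:
            inter_node_group = grp
    return intra_node_group, inter_node_group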