o
    wZh                  	   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ d dl m	Z	 d dl
mZ d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlmZ d dlZd dlmZ d dlmZ d dl m  m!Z" d d	l#m$Z$ d d
l%m&Z& d dl'm(Z(m)Z)m*Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4m5Z5m6Z6 d dl7m8Z8 d dl9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZBmCZCmDZDmEZE d dlmFZFmGZG d dlHmIZJ d dlKmLZLmMZMmNZNmOZO d dlPmQZQmRZRmSZSmTZTmUZU d dlVmWZW dZXeSrdZYdZZej[\ ZXneTrdZYdZZneUrdZYdZZej]\ ZXnd ZYd!ZZd"ZXG d#d$ d$eZ^G d%d& d&eZ_G d'd( d(ej`eZad)ej`d*ejbd+efd,d-Zcd.d/ Zd	0	1dd)ej`d2eefd3d4Zfdd5d6Zgd7d8 Zhd9d: Zidd)ej`d;eefd<d=Zjd)ej`d>eefd?d@Zkd)ej`dAeefdBdCZlG dDdE dEZmG dFdG dGeaZnG dHdI dIeaZoG dJdK dKeoZpG dLdM dMeoZqG dNdO dOeaZrG dPdQ dQerZsG dRdS dSej`ZtG dTdU dUeoZuG dVdW dWej`ZvG dXdY dYejwZxG dZd[ d[ej`Zye jzd\efd]d^Z{e jzd_efd`daZ|e jzdbefdcddZ}ee jzdeefdfdgZ~ee jzdhefdidjZee jzdkefdldmZee jzdnefdodpZdqed+edredsefdtduZ	vddwej`dxej`dyeedzf fd{d|ZG d}d~ d~eMZG dd deLZddee fddZG dd dej`ZG dd dej`ZG dd dej`ZdS )    N)ABCabstractmethod)nullcontext)deepcopy)autoEnumwraps)AnyCallablecastno_type_checkOptionalUnion)mock)
checkpoint)
DeviceMesh)
CPUOffloadfully_shardFullyShardedDataParallel)TrainingState)FSDPParamGroupRegisterPostBackwardFunction)#NO_RESHARD_AFTER_FORWARD_STRATEGIES)BackwardPrefetchMixedPrecisionShardingStrategy)ShardedGradScaler)always_wrap_policyModuleWrapPolicywrap)distribute_tensorDTensorShard)ColwiseParallelparallelize_moduleRowwiseParallelSequenceParallel)TransformerDecoderLayerTransformerEncoderLayer)DistributedDataParallel)MultiProcessTestCaseMultiThreadedTestCaserun_subtests
TEST_SKIPS)FILE_SCHEMAget_cycles_per_ms	TEST_CUDATEST_HPUTEST_XPU)
has_triton   cudaZncclzhpu:0ZhcclxpuZxcclcpuZgloo   c                   @      e Zd Ze Ze ZdS )FSDPInitModeN)__name__
__module____qualname__r   NO_FSDP	RECURSIVE rA   rA   R/var/www/auris/lib/python3.10/site-packages/torch/testing/_internal/common_fsdp.pyr;   V   s    
r;   c                   @   s   e Zd Ze Ze Ze ZdS )DEVICEInitModeN)r<   r=   r>   r   DEVICE_BEFOREDEVICE_AFTERZDEVICE_NEVERrA   rA   rA   rB   rC   _   s    
rC   c                   @   sn   e Zd ZdZedeejdf fddZedejfddZ	edd	d
Z
eedededejfddZdS )FSDPTestModelzZThis defines the interface expected from all models used commonly for
    FSDP unit tests.return.c                 C      dS )z+Returns an input for the model as as tuple.NrA   selfdevicerA   rA   rB   	get_inputl      zFSDPTestModel.get_inputc                 C   rH   )z,Returns the loss given the input and output.NrA   )rJ   inputoutputrA   rA   rB   get_lossq   rM   zFSDPTestModel.get_lossNc                 C   rH   )z<Runs the backward pass (e.g. including ``loss.backward()``).NrA   rJ   lossrA   rA   rB   run_backwardv   rM   zFSDPTestModel.run_backwardargskwargsc                  O   rH   )z&Initializes an instance of this model.NrA   rT   rU   rA   rA   rB   init{   s   zFSDPTestModel.initrG   N)r<   r=   r>   __doc__r   tupletorchTensorrL   rP   rS   staticmethodr
   nnModulerW   rA   rA   rA   rB   rF   h   s     rF   modelprocess_group	assert_fnc                 C   s   dd |   D }|dd |  D 7 }t|}dd t|D }tj|||d |d }|dus4J |dd D ]}|dusBJ t||D ]\\}}	\}}
||	|
 qGq:dS )	a  
    All-gathers module states across ranks and calls ``assert_fn`` on each pair
    of corresponding states from rank 0 and a nonzero rank. For example, if
    ``assert_fn`` is ``self.assertEqual()``, then this checks that all module
    states are equal across ranks.
    c                 S       g | ]\}}||   fqS rA   detachr8   ).0
param_nameparamrA   rA   rB   
<listcomp>       z)_assert_module_states.<locals>.<listcomp>c                 S   rc   rA   rd   )rf   Zbuffer_namebufferrA   rA   rB   ri      rj   c                 S   s   g | ]}d qS NrA   )rf   _rA   rA   rB   ri      s    groupr   Nr9   )named_parametersZnamed_buffersdistZget_world_sizerangeZall_gather_objectzip)r`   ra   rb   Znamed_module_states
world_sizeolistZrank0_statesstaterm   p1p2rA   rA   rB   _assert_module_states   s"   
ry   c                   C   s
   t tS rl   )r[   rK   DEVICE_TYPErA   rA   rA   rB   get_devtype      
r{   FTzero_buffersc              
   C   s   |rt | nt }|Q |  D ]}t  |  W d   n1 s&w   Y  q|rM|  D ]"}t  |  W d   n1 sGw   Y  q2W d   dS W d   dS 1 s`w   Y  dS )zBZeros the parameters and optionally buffers of ``model`` in place.N)FSDPsummon_full_paramsr   
parametersr[   no_gradZzero_buffers)r`   r}   Zsummon_fullctxrh   rk   rA   rA   rB   _zero_model   s"   



"r   c                 C   s"   |s|  t} |r|   |  S rl   )torz   half
state_dict)r`   cpu_offloadr   rA   rA   rB   _get_state_dict   s
   
r   c                    s   d  fdd|D S )Nrm   c                    s$   g | ]}|d ur t | ndqS )Nnone)str)rf   stest_name_mappingrA   rB   ri      s   $ z subtest_name.<locals>.<listcomp>)join)r   rT   rA   r   rB   subtest_name   s   r   c                 C   s   |  D ]\}}|jtdkr| ||< q| dkr|nd g}t| ttttj	f |d }|
 D ]}|| t||< q5|S )Nr8   r   )itemsrK   r[   r8   rq   Zbroadcast_object_listr   dictr   r\   keysr   rz   )rankr   rg   rh   ru   rA   rA   rB   _broadcast_state_dict   s   
r   recursec                 C   sB   t j| |d tt|  W  d   S 1 sw   Y  dS )a[  
    Returns the full unsharded parameters of ``model``. Any FSDP-managed
    parameters offloaded to CPU are moved to GPU in the returned list.

    Args:
        recurse (bool): If ``False``, only unshards the parameters immediate to
            ``model``; if ``True``, recurses through the module hierarchy
            rooted at ``model``.
    )r   N)r~   r   r   listr   )r`   r   rA   rA   rB   get_full_params   s   
$r   move_to_devicec                 C   s   |r|  tS | S rl   )r   rz   )r`   r   rA   rA   rB   _move_to_device      r   	wrap_fsdpc                 O   s   |s| S t | g|R i |S rl   r~   )r`   r   rT   rU   rA   rA   rB   _maybe_wrap_fsdp   s   r   c                   @   sB   e Zd ZdedefddZdefddZdefdd	Zd
d ZdS )DummyProcessGroupr   sizec                 C   s   || _ || _d S rl   )_rank_size)rJ   r   r   rA   rA   rB   __init__   s   
zDummyProcessGroup.__init__rG   c                 C      | j S rl   )r   rJ   rA   rA   rB   r         zDummyProcessGroup.rankc                 C   r   rl   )r   r   rA   rA   rB   r      r   zDummyProcessGroup.sizec                 O   s   t  }dd }||_|S )Nc                  S   s   t j } | d | S )Nr9   )r[   futuresFuture
set_result)futurerA   rA   rB   
get_future   s   

z/DummyProcessGroup.allreduce.<locals>.get_future)r   ZMockr   )rJ   rT   rU   Z	dist_waitr   rA   rA   rB   	allreduce   s   zDummyProcessGroup.allreduceN)r<   r=   r>   intr   r   r   r   rA   rA   rA   rB   r      s
    r   c                       s   e Zd Zdejdededef fddZdd Zd	d
 Z	dd Z
dd Ze			ddejdededeeeef  dededeejef fddZdd Z  ZS )TransformerWithSharedParamsro   device_init_modeadd_bndeterministicc                    s   t    | | _| | _|rtd d}d}t||| _	tj
|ddddd| _t||| _| j	j| j_| d| j	j|f | d	tj| jtjd
 d| _|r^tj| jntj | _|tjkrn| t} |rv|   d S d S )Nr               g?)d_modelZnum_encoder_layersZnum_decoder_layersZdim_feedforwardZdropout
vocab_biaslong_buffer)dtype)superr   r   r   rt   r[   manual_seedr^   Z	Embeddingembed_tokensZTransformertransformerLinearoutput_projweightregister_bufferZnew_onesZ
zeros_liker   longbsZBatchNorm1dZIdentitybnrC   rD   r   rz   eval)rJ   ro   r   r   r   Zd_vocabr   	__class__rA   rB   r     s>   





z$TransformerWithSharedParams.__init__c                 C   sN   t d| j  t jd|dd| j}t j| jd |dd| j}||fS )Nr9      rK      r5   )r[   r   r   Zarangeviewr   )rJ   rK   srctgtrA   rA   rB   rL   .  s   z%TransformerWithSharedParams.get_inputc                 C   sJ   |  |}|| j | j| }|  |}| |}| ||}| |S rl   )r   r   r   Ztype_asr   r   r   )rJ   Zsrc_idsZtgt_idsr   r   xrA   rA   rB   forward4  s   



z#TransformerWithSharedParams.forwardc                 C   s.   |\}}t jj|d|d|dddS )Nsum)Z	reduction)r^   
functionalZcross_entropyr   r   )rJ   rN   rO   rm   r   rA   rA   rB   rP   <  s   z$TransformerWithSharedParams.get_lossc                 C      |   d S rl   backwardrQ   rA   rA   rB   rS   B     z(TransformerWithSharedParams.run_backwardNFTfsdp_init_modefsdp_kwargsrG   c                 C   s   |du ri }|t jkrt| tr| d }n| }t||||S |t jkrud|vr.ttth}n|	d}d|v rI|d t
jt
jhv rIt| tsId}n| }t| trU| d }	n| }	t|	|||}
t|
|fd|i|}|tjkrs|t}|S td| )au  
        Initializes a :class:`TransformerWithSharedParams` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps with
                top-level FSDP. By default, the top-level FSDP uses the
                ``ModuleWrapPolicy`` for encoder and decoder layers, but a
                different auto wrap policy may be specified via
                ``fsdp_kwargs``.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            add_bn (bool): Whether to include batch norm in the model.
        Nr   auto_wrap_policysharding_strategyUnsupported FSDP init mode: )r;   r?   
isinstancerZ   r   r@   r   r)   r(   popr   ZHYBRID_SHARDZ_HYBRID_SHARD_ZERO2r~   rC   rE   r   rz   
ValueError)ro   r   r   r   r   r   Zpgr   Zfsdp_pgZ
tformer_pgm
fsdp_modelrA   rA   rB   rW   E  sV   









z TransformerWithSharedParams.initc                 C   s   | j gS rl   )r   r   rA   rA   rB   get_ignored_modules  s   z/TransformerWithSharedParams.get_ignored_modules)NFT)r<   r=   r>   rq   ProcessGrouprC   boolr   rL   r   rP   rS   r]   r;   r   r   r   r
   r   r^   r_   r~   rW   r   __classcell__rA   rA   r   rB   r     sD    *Mr   c                       s   e Zd Zdejdededef fddZdd Zd	d
 Z	dd Z
dd Ze		ddejdededeeeef  dedejfddZ  ZS )NestedWrappedModulero   r   r   r   c                    s   t     | _ | _|tjk} fdd}|r#td t	
tt	dd||t	
|tt	dd|tt	dd||tt	dd|tt	dd|| _d S )Nc                       rt | fi  S | S rl   r   layerr   ro   r   rA   rB   _maybe_wrap     z1NestedWrappedModule.__init__.<locals>._maybe_wrapr   r   r5   r   )r   r   r   r   rt   rC   rD   r[   r   r^   
Sequentialr   r   modulerJ   ro   r   r   r   r   r   r   r   r   rB   r     s$   





zNestedWrappedModule.__init__c                 C   s"   t d| j  t jdd|dfS )Nr9   r5   r   r   )r[   r   r   ZrandrI   rA   rA   rB   rL     s   zNestedWrappedModule.get_inputc                 C   
   |  |S rl   r   rJ   r   rA   rA   rB   r     r|   zNestedWrappedModule.forwardc                 C   s   |  }|S rl   )r   rJ   rN   rO   rR   rA   rA   rB   rP     s   zNestedWrappedModule.get_lossc                 C   r   rl   r   rQ   rA   rA   rB   rS     r   z NestedWrappedModule.run_backwardNFr   r   rG   c                 C   sn   |du ri }|t jkrt| d||dS |t jkr0t| fd||d|}|tjkr.|t}|S td| )a  
        Initializes a :class:`NestedWrappedModule` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP but not the top-level module. The model may
                later be wrapped with a top-level FSDP external to this method
                if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
        NFr   r   r   Tr   )	r;   r?   r   r@   rC   rE   r   rz   r   )ro   r   r   r   r   r   rA   rA   rB   rW     s.   



zNestedWrappedModule.initNF)r<   r=   r>   rq   r   r   rC   r   rL   r   rP   rS   r]   r;   r   r   r   r
   r^   r_   rW   r   rA   rA   r   rB   r     s<     r   c                       sJ   e Zd Ze		d
dejdededee	e
ef  def
 fdd	Z  ZS )AlwaysWrapNestedWrappedModuleNFro   r   r   r   r   c                    sl   t ttj| tj|||d}|tjkr|S |tjkr4|pi }t|fdti|}|tj	kr2|
t}|S dS )z
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, for the ``RECURSIVE`` init mode, this
        wraps with top-level FSDP and the ``always_wrap_policy()`` auto wrap
        policy.
        )ro   r   r   r   r   r   N)r   r   rW   r;   r?   r@   r~   r   rC   rE   r   rz   )ro   r   r   r   r   r`   r   r   rA   rB   rW     s&   
	


z"AlwaysWrapNestedWrappedModule.initr   )r<   r=   r>   r]   rq   r   r;   rC   r   r   r   r
   r   rW   r   rA   rA   r   rB   r     s    r   c                       st   e Zd Zdejdededef fddZedd	d
Z	e		ddejde
dedeeeef  def
ddZ  ZS )NonUniformReqGradNWMro   r   r   r   c                    s   t t|    | _ | _|tjk} fdd}|r%t	d t
tt
dd||t
|tt
dd|tt
dd||t
tt
dd|tt
dd|| _d S )Nc                    r   rl   r   r   r   rA   rB   r   +  r   z2NonUniformReqGradNWM.__init__.<locals>._maybe_wrapr   r   r5   r   )r   r   r   r   r   rt   rC   rD   r[   r   r^   r   r   r   r   r   r   r   rB   r     s,   




zNonUniformReqGradNWM.__init__rG   Nc                 C   s,   |   D ]\}}t||s|d qd S r   )rp   rematchZrequires_grad_)r`   Zreq_grad_masknprA   rA   rB   _set_nonuniform_req_gradB  s
   
z-NonUniformReqGradNWM._set_nonuniform_req_gradFr   r   c                 C   s   t d}|tjkrt| d||d}t|| |S |tjkrC|du r%i }t| fd||d|}|tjkr;|	t
}t|| |S td| )a  
        Initializes a :class:`NestedWrappedModule` instance, but unlike
        :meth:`NestedWrappedModule.init`, it wraps a second :class:`torch.nn.Sequential`
        container to enable the desired non-uniform ``requires_grad``
        ``use_orig_params=True`` tests. For both ``RECURSIVE`` and ``NO_FSDP``
        init modes, freezes all parameters except the last two to validate
        ``ShardedGradScaler`` support for ranks with no (non-zero sized) local shards in
        FSDP ``use_orig_params=True`` mode.
        zmodule\.2.*\.1.*Fr   NTr   )r   compiler;   r?   r   r   r@   rC   rE   r   rz   r   )ro   r   r   r   r   Zreq_grad_patternZ	ddp_modelr   rA   rA   rB   rW   H  s6   




zNonUniformReqGradNWM.initrX   r   )r<   r=   r>   rq   r   r   rC   r   r]   r   r;   r   r   r   r
   rW   r   rA   rA   r   rB   r     s4    *r   c                       sv   e Zd ZdZdejdedef fddZdd Zd	d
 Z	dd Z
dd Zedee dedededef
ddZ  ZS )ModuleWithDelayzThis class wraps a :class:`FSDPTestModel` to optionally add a delay
    after computing the loss and/or before the gradient reduction.r   delay_after_loss_msdelay_before_reduction_msc                    s    t    || _|| _|| _d S rl   )r   r   r   r   r   )rJ   r   r   r   r   rA   rB   r   {  s   

zModuleWithDelay.__init__c                 C   s   | j |S rl   )r   rL   rI   rA   rA   rB   rL     r   zModuleWithDelay.get_inputc                 C   r   rl   r   r   rA   rA   rB   r     r|   zModuleWithDelay.forwardc                 C   sT   | j ||}| jdkr(tstrt| jd  |S tr(tj	
t| jt   |S Nr     )r   rP   r   r2   r3   timesleepr1   r[   r6   _sleepr   r0   r   rA   rA   rB   rP     s   
zModuleWithDelay.get_lossc                    sT   t jj  fdd}td| j| W d    d S 1 s#w   Y  d S )Nc                     sN   j dkr trtjtj t   ntstr t	
j d   | i |S r   )r   r1   r[   r6   r  r   r0   r2   r3   r  r  rV   orig_reduce_scatterrJ   rA   rB   _delayed_reduce_scatter  s   
z=ModuleWithDelay.run_backward.<locals>._delayed_reduce_scatterz'torch.distributed.reduce_scatter_tensor)r[   distributedreduce_scatter_tensorr   patchr   rS   )rJ   rR   r  rA   r  rB   rS     s   
"zModuleWithDelay.run_backwardmodule_class
model_argsmodel_kwargsc                O   s   t | j|i |||S )aA  
        Args:
            module_class (Type[FSDPTestModel]): Wrapped module class to which
                to add delays.
            model_args: Positional arguments forwarded to the ``module_class``
                ``init()``.
            delay_after_loss_ms (int): Delay after computing the loss/before
                the optimizer step (in ms).
            delay_before_reduction_ms (int): Delay before reduce-scattering
                gradients (in ms).
            model_kwargs: Keyword arguments forwarded to the ``module_class``
                ``init()``.
        )r   rW   )r  r   r   r  r  rA   rA   rB   rW     s
   zModuleWithDelay.init)r<   r=   r>   rY   r^   r_   r   r   rL   r   rP   rS   r]   typerF   r
   rW   r   rA   rA   r   rB   r   w  s2    
r   c                   @   sR   e Zd Zeejddddfdejdedede	e
eef  ded	ed
efddZdS )NestedWrappedModuleWithDelayNFr   ro   r   r   r   r   r   r   c              
   C   s   t jt| ||||||dS )Nro   r   r   r   r   r   r   )r   rW   r   r  rA   rA   rB   rW     s   
z!NestedWrappedModuleWithDelay.init)r<   r=   r>   r]   rC   rE   rq   r   r;   r   r   r   r
   r   r   rW   rA   rA   rA   rB   r    s,    r  c                       $   e Zd Z fddZdd Z  ZS )DummyDDPc                    s   t    || _d S rl   )r   r   r   )rJ   r   r   rA   rB   r     s   

zDummyDDP.__init__c                 O   s   | j |i |S rl   r   rJ   rT   rU   rA   rA   rB   r     s   zDummyDDP.forwardr<   r=   r>   r   r   r   rA   rA   r   rB   r    s    r  c                       s   e Zd Zdejdedededef
 fddZdd	 Z	d
d Z
e			ddejdededeeeef  dedefddZ  ZS )MixtureOfExpertsro   r   r   delay_before_free_msr   c              
      s$  t  j||||d || _|| _|| _|tjk| _|r#t	d| j
  d}d}d}	tt||| j}
tdd |
 D | _|
 D ]}d|_qC|rPt	d	 tt||| j}|rwtj|
 g}t|
|fi |}
t||fi |}ttt|	|| j||
tt||	| j| _d S )
N)ro   r   r   r   *   r   r   r   c                 s       | ]}|  V  qd S rl   )Znumel)rf   r   rA   rA   rB   	<genexpr>       z,MixtureOfExperts.__init__.<locals>.<genexpr>Tr   )r   r   ro   r  r   rC   rD   r   r[   r   r   r   r^   r   r   r   Znum_expert_paramsexpertr  Z	new_groupr~   r   r   )rJ   ro   r   r   r  r   r   Zd_expertZd_sharedZd_inputr  r   ZsharedZexpert_groupr   rA   rB   r     sD   	

zMixtureOfExperts.__init__c                    sx   j dkr7jd }t|tr7tjjjj  fdd}t	
d| |W  d    S 1 s2w   Y  |S )Nr   r   c                     sD   t rtjtjt   ntstrt	
jd   | i |S )Nr  )r1   r[   r6   r  r   r  r0   r2   r3   r  r  rV   orig_reshardrJ   rA   rB   _delayed_reshard  s   z2MixtureOfExperts.forward.<locals>._delayed_reshardz.torch.distributed.fsdp._runtime_utils._reshard)r  r   r   r~   r[   r  fsdpZ_runtime_utilsZ_reshardr   r
  )rJ   r   r  r  rA   r  rB   r     s   


 
zMixtureOfExperts.forwardc                 C   s   |   | jsAt , |  D ]}t|drq|jd ur.|j| j tj	j
|j| jd qW d    d S 1 s:w   Y  d S d S )Nr  rn   )r   r   r[   r   r   hasattrgradZdiv_rt   r  
all_reducero   )rJ   rR   r   rA   rA   rB   rS   1  s   


"zMixtureOfExperts.run_backwardNFr   r   r   c                 C   sr   |du ri }|t jkrt| d|||dS |t jkr2t| fd|||d|}|tjkr0|t}|S td| )a  
        Initializes a :class:`MixtureOfExperts` instance.

        Args:
            fsdp_init_mode (FSDPInitMode): If ``NO_FSDP``, then does not wrap
                any modules with FSDP. If ``RECURSIVE``, then wraps some nested
                modules with FSDP, including the expert and shared layers, but
                not the top-level module. The model may later be wrapped with a
                top-level FSDP external to this method if desired.
            device_init_mode (DEVICEInitMode): Determines model movement to DEVICE.
            fsdp_kwargs (Optional[Dict[str, Any]]): Optional keyword arguments
                forwarded to the FSDP constructor.
            deterministic (bool): Whether to make the model deterministic
                across constructions.
            delay_before_free_ms (int): Delay before resharding expert
                parameters in the forward pass (in ms).
        NF)r   r   r  r   Tr   )	r;   r?   r  r@   rC   rE   r   rz   r   )ro   r   r   r   r   r  r   rA   rA   rB   rW   =  s2   



zMixtureOfExperts.init)NFr   )r<   r=   r>   rq   r   r   rC   r   r   r   rS   r]   r;   r   r   r   r
   rW   r   rA   rA   r   rB   r    s>    4r  c                       sd   e Zd Z	ddddddedeej ded	ed
ef
 fddZdej	dej	fddZ
dd Z  ZS )MLPNTFr5   )biaswith_bufferdim_multiplierdimrK   r$  r%  r&  c                   sd   t    tj||| ||d| _tj|| |||d| _|r-| dtj|f|d d S d | _	d S )N)rK   r$  rk   r   )
r   r   r^   r   in_projout_projr   r[   Zrandnrk   )rJ   r'  rK   r$  r%  r&  r   rA   rB   r   r  s   
	
zMLP.__init__r   rG   c                 C   s@   |  |}t|}| |}t|}| jd ur|| j }|S rl   )r(  Frelur)  rk   )rJ   r   zrA   rA   rB   r     s   





zMLP.forwardc                 C   s"   | j d urtjj| j  d S d S rl   )rk   r[   r^   rW   Znormal_r   rA   rA   rB   reset_parameters  s   
zMLP.reset_parametersrl   )r<   r=   r>   r   r   r[   rK   r   r   r\   r   r-  r   rA   rA   r   rB   r#  q  s&    	r#  c                       sF   e Zd Zdddedef fddZdeded	ed
d fddZ  ZS )MLPStackF)with_seq_parallelmlp_dimr/  c                   sL   t |ddt |t |ddg}|r|tj|dd t j|  || _d S )N   )r&  Fr$  )r#  appendr^   	LayerNormr   r   r/  )rJ   r0  r/  modulesr   rA   rB   r     s   


zMLPStack.__init__tp_meshdp_meshuse_activation_checkpointingrG   c                 K   s   t ddtddt ddtddt dd| jrttddnt d}| jr-tdd|d< t| ||d | D ]}t|tjr?q6|rEt	| t
|fd	|i| q6t
| fd	|i| | S )
NF)Zuse_local_outputr9   )Zoutput_layouts)z	0.in_projz
0.out_projz	1.in_projz
1.out_projz	2.in_projz
2.out_proj)Zsequence_dim3)device_meshparallelize_planmesh)r$   r&   r/  r#   r'   r%   r   r^   r4  r   r   )rJ   r6  r7  r8  r   r;  r   rA   rA   rB   parallelize  s(   
zMLPStack.parallelize)	r<   r=   r>   r   r   r   r   r=  r   rA   rA   r   rB   r.    s    r.  c                       sV   e Zd ZdZddedef fddZdejde	e
ejejf ejf fd	d
Z  ZS )DoubleLinearz
    This can be used for returning multiple outputs from a module
    (``use_second_linear=True``) or for having an unused module (``False``).
    Tr'  use_second_linearc                    s:   t    t||| _t||| _t | _|| _d S rl   )	r   r   r^   r   lin1lin2ZReLUr+  r?  )rJ   r'  r?  r   rA   rB   r     s
   


zDoubleLinear.__init__r   rG   c                 C   s6   | j r| | || | |fS | | |S rl   )r?  r+  r@  rA  r   rA   rA   rB   r     s    zDoubleLinear.forwardT)r<   r=   r>   rY   r   r   r   r[   r\   r   rZ   r   r   rA   rA   r   rB   r>    s    r>  new_all_gather_into_tensorc                 c   B    t j}t   | t _zd V  W t   |t _d S t   |t _w rl   )rq   Zall_gather_into_tensorbarrier)rC  Zorig_all_gatherrA   rA   rB   patch_all_gather     
rF  new_reduce_scatter_tensorc                 c   rD  rl   )rq   r	  rE  )rH  r  rA   rA   rB   patch_reduce_scatter  rG  rI  new_all_reducec                 c   rD  rl   )rq   r"  rE  )rJ  Zorig_all_reducerA   rA   rB   patch_all_reduce  rG  rK  new_unshardc                 c   B    t j}t  | t _zd V  W t  |t _d S t  |t _w rl   )r   Zunshardrq   rE  )rL  Zorig_unshardrA   rA   rB   patch_unshard     
rN  new_reshardc                 c   rM  rl   )r   Zreshardrq   rE  )rP  r  rA   rA   rB   patch_reshard
  rO  rQ  new_post_backwardc                 c   rM  rl   )r   Zpost_backwardrq   rE  )rR  Zorig_post_backwardrA   rA   rB   patch_post_backward  rO  rS  new_backwardc                 c   rM  rl   )r   r   rq   rE  )rT  Zorig_backwardrA   rA   rB   *patch_register_post_backward_hook_backward$  rO  rU  r  rT   rU   c                 O   sR   t |dkr|d }nd|v r|d }n
td| d| || ||i |S )Nr   rO   z,Cannot get reduce-scatter output from
args: z	
kwargs: )lenAssertionError)clsr  rb   rT   rU   rO   rA   rA   rB   reduce_scatter_with_assert1  s   

rY  rA   replicated_modulesharded_moduleprefixes_to_ignore.c                 C   s  t | | D ]\\}}\}}|}|D ]}	||	d}q| || | |t t|ts1J |j|j}
}t	|t
dt
dfkrHtdt||
|}| | |  |jd u rd| |j q	| |j t|j|
|}| |jt t|jtsJ | |j |  q	d S )N r   zmFSDP's (Shard(0), Shard(0)) layout differs from distribute_tensor(), so we cannot check for equality using it)rs   rp   replaceassertEqualZassertIsInstancer"   r   r:  
placementsrZ   r#   rW  r!   Zto_localr!  ZassertIsNoneZassertIsNotNone)rX  rZ  r[  r\  Zreplicated_nameZreplicated_paramZsharded_nameZsharded_paramZclean_sharded_nameprefixr<  r`  Zsharded_ref_paramZsharded_ref_gradrA   rA   rB   check_sharded_parityD  s2   
rb  c                       s@   e Zd Zedd Z fddZdd Zdd Zd	d
 Z  Z	S )FSDPTestMultiThreadc                 C      t S rl   DEVICE_COUNTr   rA   rA   rB   rt   f     zFSDPTestMultiThread.world_sizec                    s   t    |   d S rl   )r   setUpZ_spawn_threadsr   r   rA   rB   rh  j  s   
zFSDPTestMultiThread.setUpc                 O      t | g|R i |S rl   r-   r  rA   rA   rB   r-   n     z FSDPTestMultiThread.run_subtestsc                 C      t j  d S rl   r[   _dynamoresetr   rA   rA   rB   perThreadSetUpq     z"FSDPTestMultiThread.perThreadSetUpc                 C   rl  rl   rm  r   rA   rA   rB   perThreadTearDownt  rq  z%FSDPTestMultiThread.perThreadTearDown)
r<   r=   r>   propertyrt   rh  r-   rp  rr  r   rA   rA   r   rB   rc  e  s    
rc  c                $       sj  e Zd Z fddZedd Zedd Zedefdd	Zed
d Z	dd Z
dd Zdd Zdd Zedd Z							d4dejdedededee dedee d ed!ed"eeeef  fd#d$Zdd%d&e dddddddddfd'ee d(ed)ed*ee d+eded,ed-ee  d.ee! dee d/ed0ed ed!ed1eeeef  d"eeeef  f d2d3Z"  Z#S )5FSDPTestc                    s    t    dtjd< |   d S )N0ZTORCH_NCCL_DESYNC_DEBUG)r   rh  osenvironZ_spawn_processesr   r   rA   rB   rh  y  s   

zFSDPTest.setUpc                 C   rd  rl   re  r   rA   rA   rB   rt     rg  zFSDPTest.world_sizec                 C   s
   t j S rl   )rq   Zdistributed_c10dZ_get_default_groupr   rA   rA   rB   ra     s   
zFSDPTest.process_grouprG   c                 C   rH   r   rA   r   rA   rA   rB   destroy_pg_upon_exit  rM   zFSDPTest.destroy_pg_upon_exitc                 C   s   t  | j S rl   )r/   	file_namer   rA   rA   rB   init_method  s   zFSDPTest.init_methodc                 C      |  ||j d S rl   )r_  r   )rJ   r   r   rA   rA   rB   _check_cpu_offload  r   zFSDPTest._check_cpu_offloadc                 C   r{  rl   )r_  backward_prefetch)rJ   r   r}  rA   rA   rB   _check_backward_prefetch  r   z!FSDPTest._check_backward_prefetchc                 C   r{  rl   )r_  forward_prefetch)rJ   r   r  rA   rA   rB   _check_forward_prefetch  r   z FSDPTest._check_forward_prefetchc                 O   ri  rl   rj  r  rA   rA   rB   r-     rk  zFSDPTest.run_subtestsc              
   K   s(  | |}||_ ||_|dd}td|j  d|j  z%|r2tjjjj	
 }tjd|j||d ntj|jtt|j|j d W n ty] }	 zd|	jd	 v rXttd
 j  d }	~	ww d }
|j t }tsitrotj| |g}
tj|
d tj  ||| tj  tj|
d t  d S )Nfake_pgFzdist init r=z, world=Zfake)backendrt   r   store)rz  r  rt   r   Z	recompiler   Zbackend_unavailable)
device_ids) r   ry  getprintrt   r[   testing	_internalr  r  Z	FakeStorerq   Zinit_process_grouprz  DISTRIBUTED_BACKENDr   RuntimeErrorrT   sysexitr.   	exit_coderf  r1   r3   ZacceleratorZset_device_indexrE  rn  ro  Zrun_testZdestroy_process_group)rX  r   Z	test_namery  piperU   rJ   r  r  er  	device_idrA   rA   rB   _run  sL   


zFSDPTest._run{Gz?NFr`   	num_stepsautocastlrfsdp_cpu_offload
save_modelmixed_precisionenable_sharded_grad_scaleruse_pure_fp16sharded_grad_scaler_kwargsc              	   C   sF  |o|j }t| j}|
d u ri }
tdd|i|
}tjj| |dd}t|D ]}|	  tj
jt|dY |jtt}|	sK|r_t|ts_t|tjrV| }n	tdd |D }|| }|rt|tr|jtvr| D ]}| |jtd qs|j|||}W d    n1 sw   Y  ||}|s|	s|jtjksJ dn'|	r| |jtj nt|tr|d usJ | |j|j n| |jtj |j| |rt|tr| D ]}| |jtd q|| |   |rd	d
 |! " D }t#| |$| q*t|tr|%t&j' |( S )Nenabledg?)r  Zmomentum)r  c                 s   r  rl   )r   )rf   r   rA   rA   rB   r    r  z4FSDPTest._train_for_several_steps.<locals>.<genexpr>r8   zeloss data type should be float32, as the original                     parameter data type is float32.c                 S   s   i | ]	\}}||  qS rA   )clone)rf   kvrA   rA   rB   
<dictcomp>  s    z5FSDPTest._train_for_several_steps.<locals>.<dictcomp>rA   ))offload_paramsnextr   rK   r   r[   optimZSGDrr   Z	zero_gradampr  rz   r   rL   r   r~   r\   r   rZ   r   r   r_  rP   r   scaler   Zfloat32Zfloat16Zparam_dtyperS   stepupdater   r   r   Zload_state_dictZ_assert_stater   ZIDLEre   )rJ   r`   r  r  r  r  r  r  r  r  r  Zcpu_offload_paramsZmodel_deviceZsharded_grad_scalerr  rm   rN   rO   r   rR   r   rA   rA   rB   _train_for_several_steps  sn   





z!FSDPTest._train_for_several_stepsr   Tmodel_classr   r   ref_init_fn	num_itersr   r}  r   r  use_orig_paramsinit_kwargsc           "      K   s  |t jks	J d|du ri }d}| j }|j| jt jtjfddi|}|du r>tr5t|t	gt	d}nt||g|d}n||}|rH|
 }| j|||
du|||
|||d	}t| }||||	|
||d z|j| j|||fddi|}W n ty } ztd	| d
t| |d}~ww t|tst|| jfi |}|r|
 }|tjkr|t	}|duo|j}|o|tjk}|o|tjk}|rtd}| D ]	}| |j| q|r| tdt	 nt }| | j||d||||
|||d
} W d   n	1 s	w   Y  |rdS |r/td}| D ]
}| |j| q| t	} t|}!tjj|| dd |
du rO|sQ| j||!ddd dS dS dS )a  
        Tests FSDP training against a reference, which defaults to DDP but
        may be customized with ``ref_init_fn``.

        Args:
            model_class (Type[FSDPTestModel]): A model class that inherits from
                ``FSDPTestModel``, which defines the expected interface.
            fsdp_init_mode (FSDPInitMode): The mode to initialize the
                FSDP-wrapped model. This should not be ``NO_FSDP``.
            ref_init_fn (Optional[Callable]): A callable to invoke that wraps a
                non-wrapped model to construct the reference model, where this
                wrapper should provide data parallel semantics. If ``None``,
                then the callable defaults to the DDP constructor.
        z.Expects an FSDP init mode that wraps with FSDPNr  r   T)r  Zoutput_device)r  r  r  r  r  r  r  )r   r}  r   r  r  r  zInitializing z raised error r8   zOAn FSDP-managed module with parameter CPU offloading enabled has parameters on F)r  r  r  r  r  r  r  r  )Zcheck_dtypezFSDP did not match DDP)Zexact_devicemsg) r;   r?   ra   r   rW   rC   rD   r2   DDPrz   r   r  r   r   r  	Exceptionr   r   r   r~   rE   r   r  r[   rK   r_  assertRaisesRegexr  r   r   r  Zassert_close)"rJ   r  r   r   r  r  r  r   r}  r   r  r  r  r  r  r  r  r   r  r   r`   Z	ref_modelZref_lossZ
ddp_paramsr   r  r  Zexpects_device_errorZexpects_cpu_deviceZ
cpu_devicerh   contextZ	fsdp_lossZfsdp_unsharded_paramsrA   rA   rB   _test_fsdp_parity)  s   #






	


zFSDPTest._test_fsdp_parity)r  NFNFFN)$r<   r=   r>   rh  rs  rt   ra   r   rx  rz  r|  r~  r  r-   classmethodr  r^   r_   r   floatr   r   r   r   r   r
   r  r  rF   r;   rC   r   r   r   r  r   rA   rA   r   rB   rt  x  s    



8	

\	
rt  compile_compute_on_modulec                    s.   fddG dd dt   fdd}|S )Nc                     s>   t jjj| i |  d u st| d  r| d   d S d S )Nr   )r[   r  r  r   r   r   rV   )r  rA   rB   !fully_shard_with_compiled_compute  s   
z=compiled_fsdp_test.<locals>.fully_shard_with_compiled_computec                   @   r:   )z*compiled_fsdp_test.<locals>.FullyShardModeN)r<   r=   r>   r   EAGERCOMPILED_COMPUTErA   rA   rA   rB   FullyShardMode  s    
r  c                    s   t   fdd}|S )Nc                     s   t jjj} D ]b}| jkrt std qt jj	j
}t jj	j}t j  | jkr.|}n| jkr@dt jj	_
dt jj	_}ntd| |j|j< | i | t j  |j|j< |t jj	_
|t jj	_qd S )Nz0Inductor on GPU needs Triton and recent GPU archTr9   z!Need to implement FullyShardMode=)r[   r  r  r   r  r4   warningswarnrn  configZskip_fsdp_hooksZ	_inductorZcompile_threadsrE  r  NotImplementedError__globals__r<   )rT   rU   Zoriginal_fully_shardmodeZoriginal_skip_fsdp_hooksZoriginal_compile_threadsZfully_shard_patch)r  r  funcrA   rB   wrapper  s0   










z6compiled_fsdp_test.<locals>.decorator.<locals>.wrapperr   )r  r  )r  r  )r  rB   	decorator  s    z%compiled_fsdp_test.<locals>.decorator)r   )r  r  rA   )r  r  r  rB   compiled_fsdp_test  s   $r  c                       s&   e Zd Zd fddZdd Z  ZS )
SkipModulerG   Nc                    s    t    tjdddd| _d S N
   Fr2  )r   r   r^   r   linr   r   rA   rB   r     s   
zSkipModule.__init__c                 C   r   rl   )r  r   rA   rA   rB   r     r|   zSkipModule.forwardrX   r  rA   rA   r   rB   r    s    r  c                       r  )NestedLinearc                    sJ   t    |rttjddddt| _d S tjddddt| _d S r  )r   r   r    r^   r   r   rz   nested_linear)rJ   	fsdp_wrapr   rA   rB   r     s   
 zNestedLinear.__init__c                 C   r   rl   )r  r   rA   rA   rB   r     r|   zNestedLinear.forwardr  rA   rA   r   rB   r    s    r  c                       r  )	SkipModelc                    sH   t    tjddddt| _t t| _t	t
|dtd| _d S )Nr  Fr2  )r  )r  )r   r   r^   r   r   rz   linearr  linear_skipr    r  r  )rJ   Zdouble_nestr   rA   rB   r     s   

zSkipModel.__init__c                 C   s"   |  |}| |}| |}|S rl   )r  r  r  r   rA   rA   rB   r   $  s   


zSkipModel.forwardr  rA   rA   r   rB   r    s    r  )FT)FFrB  )rA   rl   )
contextlibrv  r   r  r  r  abcr   r   r   copyr   enumr   r   	functoolsr	   typingr
   r   r   r   r   r   Zunittestr   r[   Ztorch.distributedr  rq   Ztorch.nnr^   Ztorch.nn.functionalr   r*  Ztorch.distributed._composabler   Ztorch.distributed.device_meshr   Ztorch.distributed.fsdpr   r   r   r~   Z$torch.distributed.fsdp._common_utilsr   Z5torch.distributed.fsdp._fully_shard._fsdp_param_groupr   r   Z"torch.distributed.fsdp._init_utilsr   Z2torch.distributed.fsdp.fully_sharded_data_parallelr   r   r   Z*torch.distributed.fsdp.sharded_grad_scalerr   Ztorch.distributed.fsdp.wrapr   r   r    Ztorch.distributed.tensorr!   r"   r#   Z!torch.distributed.tensor.parallelr$   r%   r&   r'   r(   r)   Ztorch.nn.parallel.distributedr*   r  Z*torch.testing._internal.common_distributedr+   r,   r-   r.   Z$torch.testing._internal.common_utilsr/   r0   r1   r2   r3   Ztorch.utils._tritonr4   rf  rz   r  r6   Zdevice_countr7   r;   rC   r_   rF   r   ry   r{   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r#  r   r.  r>  contextmanagerrF  rI  rK  rN  rQ  rS  rU  rY  rZ   r   rb  rc  rt  r  r  r  r  r  rA   rA   rA   rB   <module>   s   		


	 ^"`M	  -


!  ]3	