# Copyright (c) Meta Platforms, Inc. and affiliates
import contextlib
from typing import cast, Optional

import torch
import torch._prims_common as utils
import torch.distributed._functional_collectives as funcol
import torch.distributed.distributed_c10d as c10d
from torch import Tensor
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor import DTensor, Replicate, Shard
from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
from torch.distributed.tensor._ops._math_ops import (
    _skip_dim,
    Reduction,
    replicate_reduction_dims,
)
from torch.distributed.tensor.placement_types import Placement


aten = torch.ops.aten


__all__ = ["loss_parallel"]


@contextlib.contextmanager
def loss_parallel():
    """
    A context manager that enables loss parallelism, where efficient parallelized loss computation
    can be performed when the input is sharded on the class dimension. Currently only the cross-entropy
    loss is supported.

    Within this context manager, one can use :func:`~torch.nn.functional.cross_entropy` or
    :class:`~torch.nn.CrossEntropyLoss` as usual, with the following assumptions on the input parameters.
    The corresponding ``backward()`` call, if any, also needs to happen under this context manager.

    Args:
        input (:class:`DTensor`):
            Input logits. Assumed to be sharded on the class dimension.
        target (Union[:class:`torch.Tensor`, :class:`DTensor`]):
            Must be ground truth class indices (class probabilities currently not supported).
            Assumed to be replicated across the ``DeviceMesh``.
        weight (Union[:class:`torch.Tensor`, :class:`DTensor`], optional):
            If given, assumed to be replicated across the ``DeviceMesh``.
        label_smoothing:
            Currently not supported.

    Returns:
        A replicated :class:`DTensor`.

    Example:
        A sharded DTensor is manually created here to showcase the usage.
        In practice, it is usually the output of a TP module.

        >>> # xdoctest: +SKIP("distributed")
        >>> from torch.distributed.tensor.parallel import loss_parallel
        >>> from torch.distributed.device_mesh import init_device_mesh
        >>> ...
        >>> device_mesh = init_device_mesh("cuda", (8,))
        >>> input = torch.randn(4, 16, device="cuda", requires_grad=True)
        >>> dist_input = distribute_tensor(input, device_mesh, placements=[Shard(1)])
        >>> target = torch.randint(16, (4,), device="cuda")
        >>> with loss_parallel():
        >>>     loss = F.cross_entropy(dist_input, target, reduction="mean")
        >>>     loss.backward()
        >>> ...
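
        As an extra, illustrative sanity check (not part of the original
        example), the replicated loss can be materialized with
        ``full_tensor()`` and compared against the single-device result:

        >>> # illustrative only: the replicated loss matches the
        >>> # unsharded cross entropy up to floating-point tolerance
        >>> torch.testing.assert_close(
        >>>     loss.full_tensor(),
        >>>     F.cross_entropy(input, target, reduction="mean"),
        >>> )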
    """
    _enable_custom_loss_ops()

    yield

    _disable_custom_loss_ops()


# Currently only needs to support one-dimensional DeviceMesh; in general,
# return the mesh_dim with placements[mesh_dim].is_shard(dim).
def _find_all_reduce_mesh_dim(placements: tuple[Placement, ...], dim: int) -> int:
    if not len(placements) == 1:
        raise ValueError(
            "Currently loss_parallel() only supports input on one-dimensional DeviceMesh."
        )
    if not placements[0].is_shard(dim):
        raise ValueError(
            f"loss_parallel() should be enabled only when the input tensor is sharded on dimension {dim}."
        )
    return 0


def _cast_to_dtensor(
    tensor, placements: tuple[Placement, ...], mesh: DeviceMesh
) -> DTensor:
    if isinstance(tensor, DTensor):
        if tensor.placements == placements:
            return tensor
        else:
            raise RuntimeError(f"Expected {placements} but got {tensor.placements}.")
    elif isinstance(tensor, torch.Tensor):
        return DTensor.from_local(
            tensor, device_mesh=mesh, placements=placements, run_check=False
        )
    else:
        raise TypeError(f"Unsupported type {type(tensor)}")


def _propagate_tensor_meta(
    op_call: torch._ops.OpOverload,
    args: tuple[object, ...],
    kwargs: dict[str, object],
) -> TensorMeta:
    op_info = DTensor._op_dispatcher.unwrap_to_op_info(op_call, args, kwargs)
    tensor_meta = DTensor._op_dispatcher.sharding_propagator._propagate_tensor_meta(
        op_info.schema
    )
    if isinstance(tensor_meta, TensorMeta):
        return tensor_meta
    elif isinstance(tensor_meta, tuple):
        return tensor_meta[0]
    else:
        raise RuntimeError(f"Unexpected tensor meta type: {type(tensor_meta)}.")


# NOTE: The implementation follows torch._decomp.decomposition._log_softmax,
# with all_reduce manually inserted to perform distributed computation.
def _log_softmax(x, dim, half_to_float, mesh, mesh_dim):
    if half_to_float:
        assert x.dtype == torch.half
    computation_dtype, result_dtype = utils.elementwise_dtypes(
        x, type_promotion_kind=utils.ELEMENTWISE_TYPE_PROMOTION_KIND.DEFAULT
    )
    x = x.to(dtype=computation_dtype, memory_format=torch.contiguous_format)
    if x.numel() == 0:
        shifted = x
    else:
        x_max = torch.amax(x, dim, keepdim=True)
        # global max over the sharded class dimension
        x_max = funcol.all_reduce(
            x_max, reduceOp=c10d.ReduceOp.MAX.name, group=(mesh, mesh_dim)
        )
        shifted = x - x_max
    shifted_sumexp = torch.sum(torch.exp(shifted), dim, keepdim=True)
    # global sum of exponentials over the sharded class dimension
    shifted_sumexp = funcol.all_reduce(
        shifted_sumexp, reduceOp=c10d.ReduceOp.SUM.name, group=(mesh, mesh_dim)
    )
    shifted_logsumexp = torch.log(shifted_sumexp)
    result = shifted - shifted_logsumexp
    if not half_to_float:
        result = result.to(result_dtype)
    return result


def _log_softmax_handler(
    op_call: torch._ops.OpOverload,
    args: tuple[object, ...],
    kwargs: dict[str, object],
) -> object:
    x = cast(DTensor, args[0])
    dim = cast(int, args[1])
    half_to_float = cast(bool, args[2])

    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, dim)

    output_tensor_meta = _propagate_tensor_meta(op_call, args, kwargs)

    res = _log_softmax(x._local_tensor, dim, half_to_float, spec.mesh, mesh_dim)

    res_spec = DTensorSpec(
        spec.mesh,
        spec.placements,
        tensor_meta=output_tensor_meta,
    )

    return DTensor(
        res,
        res_spec,
        requires_grad=res.requires_grad,
    )


# NOTE: As explained below at _nll_loss_and_log_softmax_backward, the backward
# of the log_softmax step is fused into the nll_loss backward, so this handler
# only needs to cast grad_output to the input dtype.
def _log_softmax_backward_handler(
    op_call: torch._ops.OpOverload,
    args: tuple[object, ...],
    kwargs: dict[str, object],
) -> object:
    grad_output = cast(DTensor, args[0])
    input_dtype = cast(torch.dtype, args[3])
    return grad_output.to(input_dtype)
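

# Illustrative sketch (an editorial addition, not part of the upstream module;
# the helper name is hypothetical): _log_softmax above works because
# log-softmax needs only two quantities that span the sharded class dimension,
# the global max and the global sum of exponentials, each obtainable with a
# single all-reduce. The helper below checks the same math on one process,
# using chunks as stand-ins for per-rank shards.
def _sharded_log_softmax_sketch() -> None:
    x = torch.randn(4, 16)
    shards = x.chunk(2, dim=-1)  # emulate Shard(-1) across two "ranks"
    # all_reduce(MAX) equivalent: combine per-shard maxima
    global_max = torch.stack([s.amax(dim=-1) for s in shards]).amax(dim=0)
    # all_reduce(SUM) equivalent: combine per-shard sums of exp(x - max)
    sumexp = sum((s - global_max.unsqueeze(-1)).exp().sum(dim=-1) for s in shards)
    logsumexp = global_max + sumexp.log()
    torch.testing.assert_close(
        x - logsumexp.unsqueeze(-1), torch.log_softmax(x, dim=-1)
    )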


# NOTE: The implementation follows torch._decomp.decomposition._nll_loss_forward,
# with customized communication inserted to perform distributed computation.
def _nll_loss_forward(
    x: Tensor,
    target: Tensor,
    weight: Optional[Tensor],
    local_weight: Optional[Tensor],
    reduction: int,
    ignore_index: int,
    input_shape: torch.Size,
    channel_dim: int,
    mesh: DeviceMesh,
    mesh_dim: int,
) -> tuple[Tensor, Tensor]:
    n_dims = x.dim()
    channel_dim = 1
    if n_dims < 2:
        channel_dim = 0

    def _weight_view(weight: Tensor) -> Tensor:
        if n_dims > 1:
            shape = [
                1,
            ] * n_dims
            shape[channel_dim] = weight.shape[0]
            w = weight.view(shape)
        else:
            w = weight
        return w

    if weight is not None:
        w = _weight_view(weight)
        assert local_weight is not None
        local_w = _weight_view(local_weight)
        # x is sharded on channel_dim, so a sharded weight is needed here
        x = x * local_w
    safe_target = torch.where(target != ignore_index, target, 0)
    safe_target_ = safe_target.unsqueeze(channel_dim)

    # The following code block is a distributed version of
    # result = -torch.gather(x, channel_dim, safe_target_).squeeze(channel_dim)
    partial_placement = _MaskPartial(offset_shape=input_shape, offset_dim=channel_dim)
    safe_target_partial_ = partial_placement._partition_value(
        safe_target_, mesh, mesh_dim
    )
    result_partial = torch.gather(x, channel_dim, safe_target_partial_)
    # an all_reduce happens here
    result_reduced = partial_placement._reduce_value(result_partial, mesh, mesh_dim)
    result = -result_reduced.squeeze(channel_dim)

    result = torch.where(target != ignore_index, result, 0)

    if reduction == Reduction.NONE.value and n_dims > 1:
        total_weight = x.new_full((), 0.0)
        return result, total_weight

    if weight is not None:
        new_shape = list(x.shape)
        new_shape[channel_dim] = -1
        w = w.expand(new_shape)
        wsum = torch.gather(w, channel_dim, safe_target_).squeeze(channel_dim)
        wsum = torch.where(target != ignore_index, wsum, 0)
        total_weight = wsum.sum()
    else:
        total_weight = (target != ignore_index).sum().to(x)

    # NOTE: this is correct only on 1D DeviceMesh; otherwise an additional
    # all-reduce on result and total_weight would be needed
    if reduction == Reduction.SUM.value:
        result = result.sum()
    elif reduction == Reduction.MEAN.value:
        result = result.sum() / total_weight

    return result, total_weight


def _nll_loss_forward_handler(
    op_call: torch._ops.OpOverload,
    args: tuple[object, ...],
    kwargs: dict[str, object],
) -> object:
    x = cast(DTensor, args[0])
    target = args[1]
    weight = args[2]
    reduction = cast(int, args[3])
    ignore_index = cast(int, args[4])

    channel_dim = 1 if x.dim() >= 2 else 0
    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)

    # Check user input: if target and weight are not DTensors, convert them to
    # DTensors; if they are, check that they have the expected placements.
    target_placements = _skip_dim(
        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
    )
    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
    target = _cast_to_dtensor(target, target_placements, spec.mesh)
    local_weight = None
    if weight is not None:
        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)
        # For local computation, both the (replicated) weight and the (sharded)
        # local_weight are needed in _nll_loss_forward(). local_weight is
        # generated here via DTensor redistribution, without communication.
        sharded_placements = [
            Shard(0) if i == mesh_dim else Replicate() for i in range(spec.mesh.ndim)
        ]
        local_weight = weight.redistribute(spec.mesh, sharded_placements)._local_tensor
        assert local_weight.shape[0] == x._local_tensor.shape[channel_dim]

    if reduction == Reduction.NONE.value:
        output_placements = target_placements
    else:
        output_placements = all_replicate_placements

    # tensor inputs to _propagate_tensor_meta need to be DTensors
    args = list(args)
    args[1], args[2] = target, weight
    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)

    result, total_weight = _nll_loss_forward(
        x._local_tensor,
        target._local_tensor,
        weight._local_tensor if weight is not None else None,
        local_weight,
        reduction,
        ignore_index,
        x.shape,
        channel_dim,
        spec.mesh,
        mesh_dim,
    )
    out_spec = DTensorSpec(spec.mesh, output_placements, tensor_meta=output_tensor_meta)

    return (
        DTensor(
            result,
            out_spec,
            requires_grad=result.requires_grad,
        ),
        total_weight,
    )


# NOTE: The backward computation of cross_entropy goes through two steps:
# backward for nll_loss and then backward for log_softmax. In loss parallel,
# the two steps are fused into the following function (called by
# _nll_loss_backward_handler) to avoid any additional communication; this is
# also why _log_softmax_backward_handler performs no computation.
# The implementation resembles _nll_loss_backward and _log_softmax_backward_data
# from torch._decomp.decomposition.
def _nll_loss_and_log_softmax_backward(
    grad_output: Tensor,
    x: Tensor,
    target: Tensor,
    weight: Optional[Tensor],
    reduction: int,
    ignore_index: int,
    total_weight: Tensor,
    input_shape: torch.Size,
    channel_dim: int,
    mesh: DeviceMesh,
    mesh_dim: int,
) -> Tensor:
    channel_dim = 0 if x.dim() < 2 else 1
    if reduction == Reduction.MEAN.value:
        grad_output = grad_output / total_weight

    target = target.unsqueeze(channel_dim)
    safe_target = torch.where(target != ignore_index, target, 0)
    grad_input = torch.zeros_like(x)

    # The following code block is a distributed version of
    # grad_input = torch.scatter(grad_input, channel_dim, safe_target, -1.0)
    partial_placement = _MaskPartial(offset_shape=input_shape, offset_dim=channel_dim)
    safe_target = safe_target.squeeze(channel_dim).flatten()
    masked_safe_target = partial_placement._partition_value(safe_target, mesh, mesh_dim)
    # only update grad_input to -1 if not masked
    assert partial_placement.mask_buffer.data is not None
    grad_update = partial_placement.mask_buffer.data.to(grad_input.dtype) - 1.0
    arange_1d = torch.arange(
        masked_safe_target.shape[0], device=masked_safe_target.device
    )
    # The first two cases with x.dim() <= 2 are for aten.nll_loss_backward;
    # the last case is for aten.nll_loss2d_backward.
    if x.dim() == 1:
        grad_input[masked_safe_target] = grad_update
    elif x.dim() == 2:
        grad_input[arange_1d, masked_safe_target] = grad_update
    else:
        grad_input_t = grad_input.transpose(channel_dim, -1)
        intermediate_shape = grad_input_t.shape
        grad_input_2d = grad_input_t.reshape(-1, x.shape[channel_dim])
        grad_input_2d[arange_1d, masked_safe_target] = grad_update
        grad_input = grad_input_2d.view(intermediate_shape).transpose(channel_dim, -1)

    if grad_input.dim() > grad_output.dim() > 0:
        grad_output = grad_output.unsqueeze(channel_dim)

    if weight is not None:
        new_shape = [1 for _ in range(x.dim())]
        new_shape[channel_dim] = weight.shape[0]
        weight = weight.reshape(new_shape)
        new_shape = list(x.shape)
        new_shape[channel_dim] = -1
        w = weight.expand(new_shape)
        w_target = torch.gather(w, channel_dim, target)
        grad_output = grad_output * w_target

    grad_output = torch.where(target != ignore_index, grad_output, 0)

    # NOTE: Instead of directly returning grad_input as in _nll_loss_backward,
    # the log_softmax backward is fused in here: x holds log-softmax values, so
    # exp(x) is the softmax, and (grad_input + exp(x)) * grad_output equals the
    # nll_loss backward followed by the log_softmax backward (the sum over the
    # full class dimension of grad_input * grad_output is -grad_output).
    return (grad_input + torch.exp(x)) * grad_output


def _nll_loss_backward_handler(
    op_call: torch._ops.OpOverload,
    args: tuple[object, ...],
    kwargs: dict[str, object],
) -> object:
    grad_output = cast(DTensor, args[0])
    x = cast(DTensor, args[1])
    target = args[2]
    weight = args[3]
    reduction = cast(int, args[4])
    ignore_index = cast(int, args[5])
    total_weight = cast(Tensor, args[6])

    channel_dim = 1 if x.dim() >= 2 else 0
    spec = x._spec
    mesh_dim = _find_all_reduce_mesh_dim(spec.placements, channel_dim)

    # if target and weight are not DTensors, convert them to DTensors
    target_placements = _skip_dim(
        replicate_reduction_dims(spec.placements, [channel_dim]), channel_dim
    )
    all_replicate_placements = (Replicate(),) * spec.mesh.ndim
    target = _cast_to_dtensor(target, target_placements, spec.mesh)
    if weight is not None:
        weight = _cast_to_dtensor(weight, all_replicate_placements, spec.mesh)

    # tensor inputs to _propagate_tensor_meta need to be DTensors
    args = list(args)
    args[2], args[3] = target, weight
    args[6] = _cast_to_dtensor(total_weight, all_replicate_placements, spec.mesh)
    output_tensor_meta = _propagate_tensor_meta(op_call, tuple(args), kwargs)

    result = _nll_loss_and_log_softmax_backward(
        grad_output._local_tensor,
        x._local_tensor,
        target._local_tensor,
        weight._local_tensor if weight is not None else None,
        reduction,
        ignore_index,
        total_weight,
        x.shape,
        channel_dim,
        spec.mesh,
        mesh_dim,
    )
    # the output sharding is the same as the input: Shard(channel_dim) on mesh_dim
    out_spec = DTensorSpec(
        spec.mesh,
        spec.placements,
        tensor_meta=output_tensor_meta,
    )

    return DTensor(
        result,
        out_spec,
        requires_grad=result.requires_grad,
    )


customized_loss_ops = {
    aten._log_softmax.default: _log_softmax_handler,
    aten._log_softmax_backward_data.default: _log_softmax_backward_handler,
    aten.nll_loss_forward.default: _nll_loss_forward_handler,
    aten.nll_loss2d_forward.default: _nll_loss_forward_handler,
    aten.nll_loss_backward.default: _nll_loss_backward_handler,
    aten.nll_loss2d_backward.default: _nll_loss_backward_handler,
}


def _enable_custom_loss_ops():
    DTensor._op_dispatcher._custom_op_handlers.update(customized_loss_ops)


def _disable_custom_loss_ops():
    for custom_op in customized_loss_ops:
        DTensor._op_dispatcher._custom_op_handlers.pop(custom_op)
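

# Usage sketch (illustrative, not part of the upstream module): the handlers in
# customized_loss_ops are installed on DTensor's op dispatcher only for the
# duration of loss_parallel(), so parallel and regular loss computation can be
# mixed within one program. The snippet assumes a torchrun launch on 8 GPUs,
# mirroring the docstring example above.
#
#   import torch
#   import torch.nn.functional as F
#   from torch.distributed.device_mesh import init_device_mesh
#   from torch.distributed.tensor import distribute_tensor, Shard
#   from torch.distributed.tensor.parallel import loss_parallel
#
#   mesh = init_device_mesh("cuda", (8,))
#   logits = distribute_tensor(
#       torch.randn(32, 128, device="cuda", requires_grad=True), mesh, [Shard(1)]
#   )
#   target = torch.randint(128, (32,), device="cuda")
#   with loss_parallel():
#       loss = F.cross_entropy(logits, target, reduction="mean")
#       loss.backward()  # dispatches to the handlers registered above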
2










	


I


K	

E


<
