from collections.abc import Sequence
from typing import cast

import torch
import torch.distributed.tensor._api as dtensor
from torch._prims_common import ShapeType
from torch.distributed.device_mesh import DeviceMesh
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor.placement_types import (
    _StridedShard,
    Partial,
    Placement,
    Replicate,
    Shard,
)


def compute_local_shape_and_global_offset(
    global_shape: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[tuple[int, ...], tuple[int, ...]]:
    """
    Compute the local tensor shape and the global offsets into the original tensor
    of a DTensor on its current global rank. This is useful for checkpointing purposes.

    Example (2 hosts with 4 GPUs each):
    # Below is a DeviceMesh with mesh_shape of (2, 4)
    mesh = DeviceMesh(device_type="cuda",
                        mesh=[
                        [0, 1, 2, 3],
                        [4, 5, 6, 7]
                        ],
    )

    Let's say we distribute a global_tensor of shape (8,4) over the above DeviceMesh
    with placements of [Shard(0), Shard(0)].
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1, 4], global_offset:[0, 0]
    rank1 -- local_shape:[1, 4], global_offset:[1, 0]
    rank2 -- local_shape:[1, 4], global_offset:[2, 0]
    rank3 -- local_shape:[1, 4], global_offset:[3, 0]
    rank4 -- local_shape:[1, 4], global_offset:[4, 0]
    rank5 -- local_shape:[1, 4], global_offset:[5, 0]
    rank6 -- local_shape:[1, 4], global_offset:[6, 0]
    rank7 -- local_shape:[1, 4], global_offset:[7, 0]
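
    A sketch of the corresponding call for the example above (assumes the process
    group is initialized and this rank is part of the 2x4 mesh):

        mesh = DeviceMesh("cuda", [[0, 1, 2, 3], [4, 5, 6, 7]])
        local_shape, global_offset = compute_local_shape_and_global_offset(
            (8, 4), mesh, [Shard(0), Shard(0)]
        )
        # e.g. on rank 3: local_shape == (1, 4), global_offset == (3, 0)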

    Let's say we distribute a global_tensor of shape (2,) over the above DeviceMesh
    with placements of [Shard(0)]. Not every rank will have a non-empty local tensor.
    The local shape and global offset will be as follows:
    rank0 -- local_shape:[1,], global_offset:[0,]
    rank1 -- local_shape:[1,], global_offset:[1,]
    rank2 -- local_shape:[0,], global_offset:[2,]
    rank3 -- local_shape:[0,], global_offset:[2,]
    rank4 -- local_shape:[0,], global_offset:[2,]
    rank5 -- local_shape:[0,], global_offset:[2,]
    rank6 -- local_shape:[0,], global_offset:[2,]
    rank7 -- local_shape:[0,], global_offset:[2,]
    """
    my_coordinate = mesh.get_coordinate()

    if my_coordinate is None:
        # if this rank is not part of the mesh, return empty shape and offset
        return ((), ())
    else:
        local_shape = list(global_shape)
        global_offset = [0] * len(global_shape)
        # per (tensor dim, mesh dim) stride of this rank's shard index,
        # only used by the strided-sharding path below
        shard_idx_stride_by_mesh_dim = [
            [0] * mesh.ndim for _ in range(len(global_shape))
        ]
        num_shards_by_tensor_dim = [1] * len(global_shape)

        for idx, placement in enumerate(placements):
            mesh_dim_size = mesh.size(idx)
            if isinstance(placement, Shard):
                shard_dim = placement.dim
                local_offset = [0] * len(global_shape)
                assert shard_dim < len(
                    local_shape
                ), f"Sharding dim {shard_dim} greater than tensor ndim {len(local_shape)}"
                shard_size, shard_offset = placement._local_shard_size_on_dim(
                    local_shape[shard_dim],
                    mesh_dim_size,
                    my_coordinate[idx],
                    return_offset=True,
                )

                local_shape[shard_dim] = shard_size
                local_offset[shard_dim] = shard_offset

                # If this tensor dim was already sharded by an earlier placement,
                # the new local offset is relative to the current shard, so it is
                # accumulated into the existing global offset instead of replacing it.
                if global_offset[shard_dim] <= local_offset[shard_dim]:
                    global_offset[shard_dim] = local_offset[shard_dim]
                else:
                    global_offset[shard_dim] += local_offset[shard_dim]

                num_shards_by_tensor_dim[shard_dim] *= mesh_dim_size

        # The offset computation above assumes contiguous sharding. With
        # _StridedShard, the shard index of this rank on each tensor dim is
        # recomputed from per-mesh-dim index strides, and no regular Shard may
        # appear on a tensor dim after its strided part has ended.
        strided_sharding = any(isinstance(p, _StridedShard) for p in placements)
        if strided_sharding:
            strided_part_seen = [False] * len(global_shape)
            strided_part_end = [False] * len(global_shape)
            for idx, placement in enumerate(placements):
                mesh_dim_size = mesh.size(idx)
                if isinstance(placement, Shard):
                    shard_dim = placement.dim

                    if strided_part_end[shard_dim]:
                        raise NotImplementedError(
                            f"Strided sharding does not allow Shard() to appear after "
                            f"the strided part has ended. {placement} at idx {idx} in "
                            f"{placements} violates this assumption."
                        )

                    if strided_part_seen[shard_dim]:
                        strided_part_end[shard_dim] = True

                    if isinstance(placement, _StridedShard):
                        strided_part_seen[shard_dim] = True
                        shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                            num_shards_by_tensor_dim[shard_dim]
                            // (placement.split_factor * mesh_dim_size)
                        )
                    else:
                        num_shards_by_tensor_dim[shard_dim] //= mesh_dim_size
                        shard_idx_stride_by_mesh_dim[shard_dim][idx] = (
                            num_shards_by_tensor_dim[shard_dim]
                        )

            shard_idx = [
                sum(x * y for x, y in zip(shard_idx_stride, my_coordinate))
                for shard_idx_stride in shard_idx_stride_by_mesh_dim
            ]
            global_offset = [x * y for x, y in zip(local_shape, shard_idx)]

        return tuple(local_shape), tuple(global_offset)


def compute_global_tensor_info(
    tensor: torch.Tensor, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[list[int], list[int]]:
    """
    Compute the global size and stride of a DTensor from the given local tensor.
    The local size is multiplied by `world_size` per sharding dim.
    The local stride is multiplied by `world_size` per sharding dim, for every
    dimension whose stride is at least the stride of the sharding dim.

    For example, if we have a local tensor with size (4, 8, 2) and stride (16, 1, 8),
    and the DTensor placements are [Shard(2)] with a world_size of 2,
    then the global size is (4, 8, 4) and the global stride is (16 * 2, 1, 8).
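
    The same example as a call (a sketch; ``mesh`` is assumed to be a 1-D DeviceMesh
    over 2 ranks):

        local = torch.empty_strided((4, 8, 2), (16, 1, 8))
        size, stride = compute_global_tensor_info(local, mesh, [Shard(2)])
        # size   == [4, 8, 4]
        # stride == [32, 1, 8]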

    Args:
        tensor (:class:`torch.Tensor`):
            Local tensor which DTensor will be constructed from.
        mesh (:class:`DeviceMesh`):
            Object which describes the mesh topology
            of devices for the DTensor.
        placements (Sequence[:class:`Placement`]):
            The attribute of the DTensor that describes its layout
            on the mesh topology.

    Return:
        tensor_shape: A List of int which specifies the size of the DTensor built
            on top of the local tensor.
        tensor_stride: A List of int which specifies the stride of the DTensor.
    """
    tensor_shape = list(tensor.size())
    tensor_stride = list(tensor.stride())
    for idx, placement in enumerate(placements):
        mesh_dim_size = mesh.size(idx)
        if placement.is_shard():
            shard_placement = cast(Shard, placement)
            if shard_placement.dim < 0:
                raise AssertionError(
                    "Shard placements should have negative dims normalized in "
                    f"the user-facing APIs: {shard_placement}"
                )
            shard_dim = shard_placement.dim

            assert (
                shard_dim < tensor.ndim
            ), f"Sharding dim {shard_dim} greater than tensor ndim {tensor.ndim} for placement number {idx}."

            local_dim_size = tensor_shape[shard_dim]
            tensor_shape[shard_dim] = local_dim_size * mesh_dim_size

            # recover the global stride: every stride at least as large as the
            # sharded dim's stride is scaled by the mesh dim size
            for i in range(len(tensor_stride)):
                if i != shard_dim and tensor_stride[i] >= tensor_stride[shard_dim]:
                    tensor_stride[i] = tensor_stride[i] * mesh_dim_size
        elif not isinstance(placement, (Replicate, Partial)):
            raise RuntimeError(f"placement type {type(placement)} not supported!")
    return tensor_shape, tensor_stride


def try_find_mesh_from_args(
    op_call: torch._ops.OpOverload, args: Sequence[object]
) -> DeviceMesh:
    """
    Find the device mesh object from args.
    It raises a ValueError if no device mesh is found.
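
    Illustrative call (a sketch; ``dt`` stands for any DTensor argument of the op):

        mesh = try_find_mesh_from_args(torch.ops.aten.add.Tensor, (dt, 1.0))
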
    NOTE: we can optimize this search if needed
    """
    for arg in args:
        if isinstance(arg, (dtensor.DTensor, DTensorSpec)):
            return arg.device_mesh
        elif (
            isinstance(arg, (list, tuple))
            and len(arg) > 0
            and isinstance(arg[0], (dtensor.DTensor, DTensorSpec))
        ):
            return arg[0].device_mesh

    raise ValueError(f"Cannot find device mesh from args for op : {op_call}.")


def compute_local_stride(
    global_stride: ShapeType, mesh: DeviceMesh, placements: Sequence[Placement]
) -> tuple[int, ...]:
    """
    Compute the stride of a local tensor shard, given the global stride of the DTensor.
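    For example (a sketch, reusing the numbers from compute_global_tensor_info above):
    a DTensor with global stride (32, 1, 8) sharded with [Shard(2)] over a mesh of
    size 2 has local stride (16, 1, 8): only strides larger than the stride of the
    sharded dim are divided by the mesh dimension size.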
    NOTE: Currently this function is assuming the DTensor is evenly shardable.
    """
    stride_divisors = [1] * len(global_stride)
    for mesh_idx, p in enumerate(placements):
        if p.is_shard():
            i = cast(Shard, p).dim
            # tensor dim i is sharded on mesh dim mesh_idx, so every stride
            # larger than global_stride[i] shrinks by that mesh dim's size
            for j in range(len(global_stride)):
                if global_stride[j] > global_stride[i]:
                    stride_divisors[j] *= mesh.size(mesh_idx)
    return tuple(
        global_stride[i] // stride_divisors[i] for i in range(len(global_stride))
    )


def normalize_to_torch_size(size) -> torch.Size:
    """
    Unify variable types of the size argument to torch.Size.
    Acceptable types include:
        int, Sequence[int], Tuple[int], Tuple[Sequence[int]],
        or torch.Size
    """
    if isinstance(size, torch.Size):
        return size

    if isinstance(size, int):
        torch_size = [size]
    elif len(size) == 1 and isinstance(size[0], Sequence):
        torch_size = list(size[0])
    else:
        torch_size = list(size)
    return torch.Size(torch_size)