o
    Zh^                      @   sP  d dl Z d dlZd dlmZ d dlZd dlZddlmZ ddlm	Z	m
Z
 ddlmZ G dd deZG d	d
 d
eZe jdefddZdejdefddZdejdefddZdejdefddZG dd deZG dd deZG dd deZdgdggZdgdggdgdggdgd gggZg d!g d"g d"gZdejdefd#d$ZdS )%    N)IntEnum   )ir)get_dtype_sizesympy_product)Vc                   @      e Zd ZdZdZdZdS )	NCCL_COLLr   r      N)__name__
__module____qualname__
ALL_REDUCE
ALL_GATHERREDUCE_SCATTER r   r   L/var/www/auris/lib/python3.10/site-packages/torch/_inductor/comm_analysis.pyr	          r	   c                   @   r   )NVIDIA_GPU_TYPEr   r   r
   N)r   r   r   VOLTAAMPEREHOPPERr   r   r   r   r      r   r   returnc                  C   sH   t jjt jjjpd} d| v rtjS d| v rtjS d| v r!tjS tjS )N ZV100ZA100ZH100)	torchutilsZcollect_envZget_gpu_inforunr   r   r   r   )Zgpu_infor   r   r   get_gpu_type   s   r   nodec                 C   sd   t | tjstd|  | j}|d usJ d|v rtjS d|v r$tjS d|v r+tjS td| )Nz!node is not a collective kernel: Z
all_reduceZ
all_gatherZreduce_scatterzUnsupported collective kernel: )	
isinstancer   _CollectiveKernel
ValueErrorZpython_kernel_namer	   r   r   r   )r   Zkernel_namer   r   r   get_collective_type(   s   r"   c                 C   s\   d}| j D ]&}t|jj}t|tjrt|}n	tj	j
j|dd}||t|jj 7 }q|S )Nr   )fallback)Zinputsr   Zlayoutsizer   sympyIntegerintr   graphZsizevarsZ	size_hintr   Zdtype)r   Zsz_bytesinpZnumelr   r   r   get_collective_input_size_bytes8   s   

r*   c                 C   s6   t | tjkrddlm} || jd S td|  )Nr   )_get_group_size_by_namezUnsupported collective type: )typer   r    Z"torch.distributed.distributed_c10dr+   Zconstant_args	TypeError)r   r+   r   r   r   get_collective_group_sizeE   s   r/   c                   @   r   )NCCL_HWr   r   r
   N)r   r   r   NVLINKZPCINETr   r   r   r   r0   S   r   r0   c                   @   s   e Zd ZdZdZdS )	NCCL_ALGOr   r   N)r   r   r   ZTREERINGr   r   r   r   r3   Y   s    r3   c                   @   s   e Zd ZdZdS )
NCCL_PROTOr   N)r   r   r   LLr   r   r   r   r5   ^   s    r5   g333333@gffffff@g333333?      ?g      @g@)     C@r8   gffffff4@)gU@g     6@g      3@c                 C   s  t | }|d d d }d}t| }t|| }|}|dkr!dS tj}tj}t| }	t	j
jj}
t	j
jj}t }|dkr@|d nd}|dkrH|nd}t| | }|dkrV|
n|}d}|| }t|||dksj|	tjkrldnd }|	tjkr|d|d  }n|	tjtjfv r|d }d| | }|| }|d	 }tj}|	tjkr|dkrd| }nd}n|	tjtjfv r|d }t| | }t| | | }ttj | | }d
}|dkrd}t||}||| | ||  7 }|d }|| }|| S )a9  
    Returns estimated NCCL collective runtime in nanoseconds (ns).

    The following heuristics are copied from https://github.com/NVIDIA/nccl/blob/master/src/graph/tuning.cc.
    We aim to estimate the runtime as accurately as possible.

    Assumptions:
    - only ring algorithm (NCCL_ALGO_RING) is used
    - only Low-Latency protocol (NCCL_PROTO_LL) is used, i.e. Simple or LL128 is not used
    - 8 gpus per node  # TODO: Need to find a way to get accurate "gpus per node" and "# nodes" info.
    - collective is one of: allreduce, reducescatter, allgather
    i      r   r   r
   g      ?gUUUUUU?r7   g    eAg        g     @@)r*   r/   mathceilr3   r4   r5   r6   r"   r   Z	_inductorconfigZintra_node_bwZinter_node_bwr   llMaxBwsminr	   r   r   r   r0   r1   baseLathwLatr2   max)r   Ztensor_storage_size_bytesZtensor_storage_size_GBZnum_gpus_per_nodeZ
group_sizeZnNodesZnRanksZ	nccl_algoZ
nccl_protoZcollZbwIntraZbwInterZcompCapIndexindex2Zindex1ZllMaxBwbwZ	nChannelsZbusBwZnstepsratio	bandwidthZbandwidth_GB_per_nsZintraHwZnInterStepsZlatencyZintraLatZinterLatZnetOverheadZ
latency_nsZtransport_nsr   r   r    estimate_nccl_collective_runtime   sf   





rF   )	functoolsr:   enumr   r%   r   r   r   r   r   r   Zvirtualizedr   r	   r   	lru_cacher   ZIRNoder"   r'   r*   r/   r0   r3   r5   r?   r@   r=   floatrF   r   r   r   r   <module>   sL    