a
    gh4O                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ dd Zd	d
 Zdd Zd*ddZd+ddZd,ddZG dd dZG dd dZdd Zd-ddZd.d d!Zd"d# Zed/d&d'Zd0d(d)ZdS )1    N)contextmanager)AnyDictList   )language)runtimec                 C   sL   d | } dddd|  dg}t|}|tjjd}dd |D }|S )	N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounitsc                 S   s   g | ]}t |qS  )int.0xr   r   </var/www/auris/lib/python3.9/site-packages/triton/testing.py
<listcomp>       znvsmi.<locals>.<listcomp>)join
subprocesscheck_outputdecodesysstdoutencodingsplit)attrscmdoutretr   r   r   nvsmi   s    

r!   c                    s0   t  t   fddfdd|D S )Nc                    sb   d|   krdksn t d| d  }t|}t|}|| }d|  |  | |   S )Nr   r   z%Quantiles must be in the range [0, 1])
ValueErrormathfloorceil)qZpointloweruppert)anr   r   get_quantile   s    

z_quantile.<locals>.get_quantilec                    s   g | ]} |qS r   r   )r   r&   )r,   r   r   r   '   r   z_quantile.<locals>.<listcomp>)lensorted)r*   r&   r   )r*   r,   r+   r   	_quantile   s    	r/   c                 C   s~   |d ur*t | |}t|dkr&|d }|S |dkr6| S |dkrFt| S |dkrVt| S |dkrht| S |dkrzt| S d S )Nr   r   allminmaxmeanmedian)r/   r-   r1   r2   
statisticsr3   r4   )times	quantilesreturn_moder    r   r   r   _summarize_statistics*   s    

r9      r3   c              	   C   s  ddl }|dv sJ |j|j  |   |durZ|D ]}|  |d d|_q<|jjdd}|jjdd}|  t	dD ]
}	|   q|  |j
  ||d }
|
dkrd}ntdt||
 }|j }|j|@ t	|D ]&}	|dur|D ]}d|_q|   qW d   n1 s00    Y  |j
  g }d	}t	|D ]X}	|jjdd}|jjdd}|  |  |  |j
  |||| g7 }qTt|||W  d   S 1 s0    Y  dS )
a  
    Benchmark the runtime of the provided function.

    :param fn: Function to benchmark
    :type fn: Callable
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    r   Nr1   r2   r3   r4   r0   TZenable_timing   i  r   
   )torchcudastreamZStreamZdetach_Zrequires_grad_gradEventrecordrangesynchronizeelapsed_timer2   r   Z	CUDAGraphgraphZreplayr9   )fnrepgrad_to_noner7   r8   r?   r   start_event	end_event_estimate_msn_repeatgr    Z	n_retriesr   r   r   do_bench_cudagraph<   sN    




(

rR      d   c                    st  |dv sJ t jj  |      t jj } jdd} jdd}|  tdD ]}	t jj	| |   qZ|     |
|d }
tdt||
 }tdt||
 } fddt|D } fddt|D }t|D ]
}	|   qt|D ]L}|d	ur|D ]}d	|_qt jj	| ||   |   ||   q   d
d t||D }t|||S )a  
    Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
    the 20-th and 80-th performance percentile.

    :param fn: Function to benchmark
    :type fn: Callable
    :param warmup: Warmup time (in ms)
    :type warmup: int
    :param rep: Repetition time (in ms)
    :type rep: int
    :param grad_to_none: Reset the gradient of the provided tensor to None
    :type grad_to_none: torch.tensor, optional
    :param quantiles: Performance percentile to return in addition to the median.
    :type quantiles: list[float], optional
    :param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
    :type return_mode: str
    r;   Tr<   r=   r   c                    s   g | ]} j d dqS Tr<   rC   r   iZdir   r   r      r   zdo_bench.<locals>.<listcomp>c                    s   g | ]} j d dqS rU   rV   rW   rY   r   r   r      r   Nc                 S   s   g | ]\}}| |qS r   )rG   )r   ser   r   r   r      r   )r   driveractiveZget_device_interfacerF   Zget_empty_cache_for_benchmarkrC   rD   rE   clear_cacherG   r2   r   rB   zipr9   )rI   ZwarmuprJ   rK   r7   r8   cacherL   rM   rN   rO   Zn_warmuprP   rX   r   r6   r   rY   r   do_bench   s>    

ra    c                 C   sN  ddl }ddl}t| |js&|| } t||js<||}|du rHd}t|rZ|| jn|}|du rjd}t|r||| jn|}t| |jr| j|jkr|  } | 	 
   } t||jr|j|jkr| }|	 
   }| jdks|jdkr|jj| |||dd dS |j| |||dsJt| d	|  d
| d| d| d
dS )a  
    Asserts that two inputs are close within a certain tolerance.

    :param x: The first input.
    :type x: scala, list, numpy.ndarray, or torch.Tensor
    :param y: The second input.
    :type y: scala, list, numpy.ndarray, or torch.Tensor
    :param atol: The absolute tolerance. Default value is 1e-2.
    :type atol: float, optional
    :param rtol: The relative tolerance. Default value is 0.
    :type rtol: float, optional
    :param err_msg: The error message to use if the assertion fails.
    :type err_msg: str
    r   Ng{Gz?g        r   T)atolrtolZ	equal_nan)rc   rd    z is not close to z (atol=z, rtol=))numpyr?   
isinstanceZTensorZtensorcallabledtypebfloat16floatcpudetachsizeZtestingZassert_allcloseZallcloseAssertionError)r   yrc   rd   err_msgnpr?   r   r   r   assert_close   s2    

rt   c                   @   sL   e Zd ZdZdee ee eee ee eeeef eeeedddZ	dS )		Benchmarkzk
    This class is used by the :code:`perf_report` function to generate line plots with a concise API.
    rb   FN)x_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                 C   sL   || _ || _|
| _|| _|| _|| _|| _|| _|| _|	| _	|| _
|| _dS )aq  
        Constructor.
        x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
        of scalars and there are multiple x_names, all arguments will have the same value.
        If x_vals is a list of tuples/lists, each element should have the same length as
        x_names.

        :param x_names: Name of the arguments that should appear on the x axis of the plot.
        :type x_names: List[str]
        :param x_vals: List of values to use for the arguments in :code:`x_names`.
        :type x_vals: List[Any]
        :param line_arg: Argument name for which different values correspond to different lines in the plot.
        :type line_arg: str
        :param line_vals: List of values to use for the arguments in :code:`line_arg`.
        :type line_vals: List[Any]
        :param line_names: Label names for the different lines.
        :type line_names: List[str]
        :param plot_name: Name of the plot.
        :type plot_name: str
        :param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
        :type args: Dict[str, Any]
        :param xlabel: Label for the x axis of the plot.
        :type xlabel: str, optional
        :param ylabel: Label for the y axis of the plot.
        :type ylabel: str, optional
        :param x_log: Whether the x axis should be log scale.
        :type x_log: bool, optional
        :param y_log: Whether the y axis should be log scale.
        :type y_log: bool, optional
        :param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
        :type styles: list[tuple[str, str]]
        N)rv   rw   r   rx   ry   rz   r   stylesr}   r~   r{   r|   )selfrv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   __init__   s    /zBenchmark.__init__)rb   rb   FFN)
__name__
__module____qualname____doc__r   strr   r   boolr   r   r   r   r   ru      s&        
ru   c                   @   s4   e Zd Zdd ZdeeeedddZdd	d
ZdS )Markc                 C   s   || _ || _d S N)rI   
benchmarks)r   rI   r   r   r   r   r   9  s    zMark.__init__F   )bench	save_path
show_plots
print_datac              
      s|  dd l }dd lm}	 dd l}
|j}dd |jD }dd |jD }t|j}|
j|| | | d}|jD ]
 t	 tt
fs fdd|D  t t|krtdt| d  tt| }g g g   }}}|jD ]z}| jf i ||j|i|j|}z|\}}}W n$ ty6   |d d   }}}Y n0 ||g7 }||g7 }||g7 }qt | | | |jt|< qj|jr|	  |	 }|d }t|jD ]\}}||d	  ||d
   }}|jr|j| d nd }|jr|j| d nd }|j|| || |||d |  s|  s|t}|t}|j|| ||d|d q|   |!|j"pv| |#|j$ |%|j&rdnd |'|j(rdnd |r|	)  |r|	*|j+,||j d |||j  }|r(|j-d dkr(|j./ \}}|| ||  |d< |rHt0|jd  t0|1  |rx|j2|j+,||j dd| ddd |S )Nr   c                 S   s   g | ]}| d qS )-minr   r   r   r   r   r   D  r   zMark._run.<locals>.<listcomp>c                 S   s   g | ]}| d qS )-maxr   r   r   r   r   r   E  r   )columnsc                    s   g | ]} qS r   r   )r   rN   r   r   r   r   K  r   z	Expected z values, got r   r   r   )labelcolorZlsg333333?)alphar   logZlinearz.png   ZDiff:z.csvz%.fF)Zfloat_formatindex)3osZmatplotlib.pyplotZpyplotZpandasrz   listrv   Z	DataFramerw   rh   tupler-   r"   dictr_   ry   rI   rx   r|   	TypeErrorlocr{   figureZsubplot	enumerater   ZplotZisnullr0   Zastyperl   Zfill_betweenZlegendZ
set_xlabelr}   Z
set_ylabelr~   Z
set_xscaler   Z
set_yscaler   showZsavefigpathr   shaper   tolistprintZ	to_stringZto_csv)r   r   r   r   r   Zdiff_colZsave_precisionZkwragsr   ZpltpdZy_meanZy_minZy_maxrv   ZdfZx_argsZrow_meanZrow_minZrow_maxrq   r    axZfirst_xrX   colZstyZcol0Zcol1r   r   r   _run=  sv    

$

 

"z	Mark._runrb   c                 K   s|  t | jt}|r| jgn| j}g }z|D ]"}	|| j|	|||fi | q(W |rtj|dd ttj	|ddN}
|

d |d t| D ]}	|

d|	j d q|

d W d    n1 s0    Y  n|r^tj|dd ttj	|ddP}
|

d |d t| D ]}	|

d|	j d q|

d W d    n1 sT0    Y  0 |rx|rt|d	 S |S d S )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rh   r   ru   appendr   r   makedirsopenr   r   writer-   r{   )r   r   r   r   Z	return_dfkwargsZhas_single_benchr   Z
result_dfsr   htmlr   r   r   run  s2    "
*
,zMark.runN)Fr   )FFrb   F)	r   r   r   r   ru   r   r   r   r   r   r   r   r   r   7  s
     Er   c                    s    fdd}|S )z
    Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

    :param benchmarks: Benchmarking configurations.
    :type benchmarks: List of :class:`Benchmark`
    c                    s
   t |  S r   )r   )rI   r   r   r   <lambda>  r   zperf_report.<locals>.<lambda>r   )r   wrapperr   r   r   perf_report  s    r   c                 C   s^   ddl }ddlm} | s"|j } |jj| d }|jj| d }|| d d d	 }|S )
z return DRAM bandwidth in GB/s r   Nr   r\   Zmem_clock_rateZmem_bus_widthr   g    .A   )r?   r   r\   r@   current_devicer]   utilsget_device_properties)devicer?   r\   Zmem_clock_khzZ	bus_widthZbw_gbpsr   r   r   get_dram_gbps  s    
r   c           	      C   s   dd l }ddlm} |s"|j }|jj|d d }|j|}|d dk rd| |j	ks^J d}nV| |j
|jfv rzd}n@| |j	|j|jfv rd}n&| |jtjtjtjfv rd	}ntd
|| | d }|S )Nr   r   r   multiprocessor_count   r      i   i   dtype not supported&.>)r?   r   r\   r@   r   r]   r   r   get_device_capabilityfloat16float32int32rk   int16Zint8tlZ
float8e4nvZfloat8e4b15Zfloat8e5RuntimeError	rj   Z
clock_rater   r?   r\   Znum_subcoresZ
capabilityZops_per_sub_coretflopsr   r   r   get_max_tensorcore_tflops  s$    
r   c                     s    fdd}|S )Nc                    s   t   fdd}|S )Nc            
         s   dd l }|t  }  | k}|r|dkrtjjd }tj	d dd}d|v shJ d|d j
jj}| d	j d
| d}tjddd|gd|d}	|	jdksJ ddt|	jv sJ n| i | d S )Nr   zcuda-memcheck__file__PATH1)r   ZPYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]Zpytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodeZcallspecidr   r   r   
returncoder   r   )
r|   r   r   Z	ppid_nameZrun_cuda_memcheckr   r   Ztest_idr   r   )target_kwargstest_fnr   r   r     s    z1cuda_memcheck.<locals>.decorator.<locals>.wrapper)	functoolswraps)r   r   r   )r   r   	decorator  s    z cuda_memcheck.<locals>.decoratorr   )r   r   r   r   r   cuda_memcheck  s    r   F    c              
   c   s$  zt g d t dddd|  d|  g t dddd| d| g tdgd	 }td
gd	 }t||  dk sJ d|  dt|| dk sJ d| dd|  }d| d }||fV  W t g d t g d t g d n,t g d t g d t g d 0 d S )N)r
   r   r   -pmr   r
   r   r   z--lock-gpu-clocks=r	   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryr>   zGPU SMs must run at z MHzg 3O?i   gMbP?)r
   r   r   r   r   )r
   r   r   z-rgc)r
   r   r   z-rmc)r   r   r!   abs)Zref_sm_clockZref_mem_clockZcur_sm_clockZcur_mem_clockr   Zgbpsr   r   r   set_gpu_clock  s6      r   c           	      C   s   dd l }ddlm} |s"|j }|jj|d d }|j }|d dk rx| |j	kr^d}q| |j
krnd}qtd	n.| |j	krd}n| |j
|jfv rd}ntd	|| | d
 }|S )Nr   r   r   r   r   r       @   r   r   )r?   r   r\   r@   r   r]   r   r   r   r   r   r   rk   r   r   r   r   get_max_simd_tflops  s&    





r   )r:   NNr3   )rS   rT   NNr3   )NNrb   )N)N)r   r   )N)r   r#   r   r5   r   r   
contextlibr   typingr   r   r   rb   r   r   r   r!   r/   r9   rR   ra   rt   ru   r   r   r   r   r   r   r   r   r   r   r   <module>   s0   
C
B
3Cc

