o
    Zh                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZ dd a					dddZ
dd	 Zd
d Zdd Zdd Zdd Zg adedefddZ			dddZdS )    N)profileProfilerActivityc                   C   s   d S )N r   r   r   O/var/www/auris/lib/python3.10/site-packages/torch/_functorch/benchmark_utils.pysynchronize   s   r      c	              	   C   sr  |du rdg}|dgkrt j rt jja|du ri }|du r!i }|> t d tdD ]}	| |fi | t  q-t d t }
t|D ]}	| |fi | t  qHt }W d   n1 sdw   Y  ||
 }tdd|i|4}|" t  t d t|D ]}	| |fi | t  qW d   n1 sw   Y  W d   n1 sw   Y  |	| |S )a0  
    Output the chrome trace of running f(input, **kwargs_for_f) with [optimize_ctx]
    [num_runs] times to [trace_filename].

    [activities] are the activities that the profiler will record, e.g. ProfilerActivity.CUDA.
    Return total runtime without the profiler

    Outputs to trace_filename
    Ncudacpui9     
activitiesr   )
torchr   Zis_availabler   Zmanual_seedrangetimeperf_counterr   Zexport_chrome_trace)finputZtrace_filenameoptimize_ctxr   num_runsdevicesZkwargs_for_fZkwargs_for_profiler_t0t1ZtimingZprofr   r   r   dump_chrome_trace   sD   




r   c                 C   s   t | }t|}|d }|S )NZtraceEvents)openjsonload)filenamer   dataeventsr   r   r   get_chrome_trace_eventsK   s   
r   c                 C   s(   d| v o| d t v od| v o| d dkS )NpidphX)gpu_pidseventr   r   r   is_gpu_compute_eventR   s   

r&   c                 C   s4   g }| D ]}t |sq|| qt|tddS )Nts)key)r&   appendsortedoperator
itemgetter)r   sorted_gpu_eventsr%   r   r   r   get_sorted_gpu_events\   s   r.   c                 C   s   t | dkrdS | d }|d |d  }|d }| dd  D ]}t|d |}|d |d  }|t|| d }t||}q|S )Nr   r'   Zdurr   )lenmax)r-   r%   Zcurrent_end_timeZtotal_duration
start_timeend_timer   r   r   get_duratione   s   r3   c                 C   s6   dd }t | }g }|D ]}||sq|| q|S )Nc                 S   s8   d| v od| d v pd| d v pd| d v pd| d v S )NnameZgemmconvZcutlassZwgradr   r$   r   r   r   is_mm_conv_eventt   s   


z7get_sorted_gpu_mm_conv_events.<locals>.is_mm_conv_event)r.   r)   )r   r6   Z
gpu_eventsZsorted_eventsr%   r   r   r   get_sorted_gpu_mm_conv_eventss   s   r7   r   total_lengthc                 C   s   t | }g a|D ]}d|vrq|d dkr$d|d d v r$t|d  q|d }t|}t|| }t|}t|| }||fS )a  
    Process the chrome traces outputs by the pytorch profiler to compute GPU Utilization
    and percent of times spent on matmul and convolution

    Args:
        filename(str): Name of chrome traces file produced by pytorch profiler

        total_length(float): total length of the process without profiler in second

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)
    r4   Zprocess_labelsZGPUargslabelsr    g    .A)r   r#   r)   r.   r3   r7   )r   r8   r   r%   r-   utilizationZsorted_gpu_mm_conv_eventsmm_conv_utilizationr   r   r   compute_utilization   s   r=   tmp_chrome_tracec              	   C   sz   t j|}|st | td|  |du rt }t j||d }t| |||t	j
g|dgd}t||\}	}
|	|
fS )a  
    Benchmark the GPU Utilization and percent of time spent on matmul and convolution operations of
    running f(input, **kwargs_for_f) with [optimize_ctx] [num_runs] times.
    It will produce a chrome trace file in trace_folder/trace_file_name.json

    Example:

    ```
    def f(a):
        return a.sum()
    a = torch.rand(2**20, device="cuda")
    utilization, mm_conv_utilization = benchmark_utilization(f, a, "tmp", trace_file_name = "tmp_chrome_trace")
    ```

    Args:
        f: function to benchmark

        input: input to :attr:`f`

        trace_folder: name of the folder to store the chrome trace

        optimize_ctx: the context in which f will run

        trace_file_name: name of the dumped chrome trace file, default to "tmp_chrome_trace"

        num_runs: number of times to run f, excluding the warm-up runs, default to 1.

    Return:
        tuple: (GPU Utilization, percent of time spent on matmul and convolution)

    zcreate folder Nz.jsonr   )r   r   )ospathexistsmakedirsprint
contextlibnullcontextjoinr   r   CUDAr=   )r   r   Ztrace_folderr   Ztrace_file_namer   ZisExistZchrome_trace_file_namer8   r;   r<   r   r   r   benchmark_utilization   s(   '
	rH   )r   NNN)Nr>   r   )rD   r   r+   r?   r   r   Ztorch.profilerr   r   r   r   r   r&   r.   r3   r7   r#   strfloatr=   rH   r   r   r   r   <module>   s0   

:
	&