o
    Zh[9                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
 d dlZd dlmZ d dlmZ ddlmZ dd	lmZmZ G d
d de
Zg dZdedefddZdedefddZdefddZdede	eeef  ddfddZe jG dd dZdedejj j!de"de#d eddfd!d"Z$de"d#e#d$e#ded%eddfd&d'Z%ded%eddfd(d)Z&d%eddfd*d+Z'ded%eddfd,d-Z(dS ).    N)defaultdict)
ModuleType)AnyOptionalProtocol)
DeviceType)
OrderedSet   )benchmarker)create_bandwidth_info_strget_num_bytesc                   @   s"   e Zd ZdededefddZdS )BenchmarkCallableTypetimesrepeatreturnc                 C   s   d S N )selfr   r   r   r   P/var/www/auris/lib/python3.10/site-packages/torch/_inductor/wrapper_benchmark.py__call__   s    zBenchmarkCallableType.__call__N)__name__
__module____qualname__intfloatr   r   r   r   r   r      s    r   )ZforeachZpersistent_reductionZ	pointwiseZ	reductionZ
split_scantemplatesrc_coder   c                    *    fddt D }t|dkr|d S dS )z
    Similar to get_kernel_category but use the source code. Call this API
    if we have not compile the src_code to module yet.
    c                    s   g | ]}d |  v r|qS )z@triton_heuristics.r   .0chr   r   r   
<listcomp>#   s    z6get_kernel_category_by_source_code.<locals>.<listcomp>r	   r   unknown_kernel_category_choiceslen)r   choicesr   r!   r   "get_kernel_category_by_source_code   s   
r(   
kernel_modc                    r   )a  
    Given the module defining a triton kernel, return the category of the kernel.
    Category can be one of:
    - pointwise
    - reduction
    - persistent_reduction

    Currently we simply decide the category depending on what decorator is imported
    by the kernel.
    c                    s   g | ]	}| j v r|qS r   )__dict__r   r)   r   r   r"   7   s    z'get_kernel_category.<locals>.<listcomp>r	   r   r#   r$   )r)   r'   r   r+   r   get_kernel_category,   s   r,   modc                    s<   ddl m   fdd| j D }t|dksJ |d S )Nr   CachingAutotunerc                    s(   g | ]\}}| d rt| r|qS )triton_)
startswith
isinstance)r   kvr.   r   r   r"   A   s    z%get_triton_kernel.<locals>.<listcomp>r	   )Z)torch._inductor.runtime.triton_heuristicsr/   r*   itemsr&   )r-   Z	cand_listr   r.   r   get_triton_kernel>   s   
r6   benchmark_namebenchmark_all_configsc                    s  ddl m} d}|jD ]ˉj}tdrtdsqt}t}  tdd |j	j
D }|jdddu rFt d	|id
 	d#dtdtt dtt dtt dtdtffdd}| dd|dd   d|dd  }	|rtdsJ  }
t|	 |
 D ]\}}td|||j|j|j d|j  qn-tj fdddd}t|jdksJ d |jd }t|||j|j|j|	 dd! |d7 }q|dkrtd" dS dS )$aX  
    An experimental API used only when config.benchmark_kernel is true.

    Run the kernel benchmarks for all the kernels cached in PyCodeCache.
    Used in the compiled modules.

    Put this method here rather than codegen it for convenience since its implementation
    does not change based on different graph modules being compiled.
    r   )PyCodeCacheget_argscallc                 S   s   g | ]	}| d r|qS )Z
in_out_ptr)r1   )r   Zarg_namer   r   r   r"   b   s    z)benchmark_all_kernels.<locals>.<listcomp>Zkernel_num_gbNZnum_in_out_argsg    eA msn_regsn_spillssharedprefixr   c                    sZ   t dd |||fD sd|dd|dd|dd}nd	} | d
  }t|  |||dS )Nc                 s   s    | ]}|d u V  qd S r   r   )r   xr   r   r   	<genexpr>s   s    z>benchmark_all_kernels.<locals>.get_info_str.<locals>.<genexpr>  3z regs  z	 spills  8z shared memr<   g     @@)rA   suffix)anyr   )r=   r>   r?   r@   rA   Zkernel_detail_strZgb_per_s)num_gbr   r   get_info_strl   s   
z+benchmark_all_kernels.<locals>.get_info_strZ20    
   r8   rD   z @ c                      s
     S r   )r;   r   )argsr)   r   r   <lambda>   s   
 z'benchmark_all_kernels.<locals>.<lambda>(   )repr	   z.Autotuner should have selected the best config)rA   zpNo kernel with benchmark functionality found. Make sure you run inductor with config.benchmark_kernel being True)r<   )Ztorch._inductor.codecacher9   moduleskeyhasattrr6   r,   r:   r&   fn	arg_namesZinductor_metagetr   r   r   r   strupperr8   printr5   r>   r?   r@   configr
   Zbenchmark_gpuZ	launchers)r7   r8   r9   ZnfoundZ
kernel_keyZtriton_kernelZkernel_categoryZnum_in_out_ptrsrJ   Zkernel_descZbench_resultlauncherr=   r   )rN   r)   rI   r   benchmark_all_kernelsJ   s|   
(
 


r]   c                   @   s.   e Zd ZU eed< eed< eed< eed< dS )ProfileEventcategoryrS   self_device_time_mscountN)r   r   r   rX   __annotations__r   r   r   r   r   r^      s
   
 r^   
event_listwall_time_msnrunsdevice_namec           	         s  dt jjjdtffddtt dt jjjdtdd f fdd}|D ]:}|jr/J d|j	t
jkr6q&d	}|jd
r[|jdrGd}n|jdrPd}n|jdrYd}nd}||| q&dtdtt dtffddd fdd}|  d S )Nevr   c                    s   | j d   S )zV
        ev.self_device_time_total is in microsecond. Convert to millisecond.
          )self_device_time_totalrg   )re   r   r   get_self_device_time   s   z6parse_profile_event_list.<locals>.get_self_device_timer_   c                    s.   t || j| | j d} | | d S )N)r_   rS   r`   ra   )r^   rS   ra   append)rg   r_   Z
profile_ev)
all_eventsrk   re   r   r   	add_event   s   z+parse_profile_event_list.<locals>.add_eventz!Don't support the legacy profilerr#   r0   Z
triton_poitriton_pointwiseZ
triton_redtriton_reductionZ
triton_pertriton_persistent_reductiontriton_unknownprofile_eventsc                    s    sdS ddl m } |jdd dd g }d}td|  d	 |D ]"}||j7 }|j d
 dd}||jd d |j|j|g q!|d|d| d
 ddg t||dd   dddgd |S )N        r   )tabulatec                 S   s   | j S r   )r`   rj   r   r   r   rO      s    zCparse_profile_event_list.<locals>.report_category.<locals>.<lambda>T)rS   reversez
  == z category kernels == d   .2f%x   ZTotalr<   ZKernelzSelf z
 TIME (ms)ZCountZPercent)headers)ru   sortrZ   r`   rl   rS   ra   rY   )r_   rs   ru   rows
total_timerg   percent)rf   rd   r   r   report_category   s2   
"z1parse_profile_event_list.<locals>.report_categoryc                     s  g d} t   t | sJ t   i }d}| D ]}| v r3| | }|||< ||7 }q| d dd}rMtd  d|  ntd td	d
d d }| D ]}||d d dd}|d| 7 }qa|d| dd
d7 }t| d S )N)ro   rp   rq   rr   r#   rt   rw   rx   ry   z
Percent of time when z
 is busy: zNo device detectedzTotal wall time .3fz mszOutput for tabulate: z, r=   )r   keysissubsetlistrZ   rY   rW   )Zcategory_listZper_category_wall_timeZtotal_device_msr_   _timeZdevice_busy_percentZtabulate_liner   )rm   r7   rf   r   rd   r   r   report   s4   
z(parse_profile_event_list.<locals>.report)r   N)torchautogradprofiler_util	EventListr   r   r   rX   	is_legacyZdevice_typer   ZCPUrS   r1   r^   )	r7   rc   rd   re   rf   rn   rg   r_   r   r   )rm   r7   rf   rk   re   r   rd   r   parse_profile_event_list   s<    
,r   r   r   benchmark_compiled_module_fnc                 C   s   t jjdd}|||d W d    n1 sw   Y  t  d}|| td| d td|  |jdd}t|jd	d
d t	||| || |j
 d S )NT)Zrecord_shapesr   r   z/compiled_module_profile.jsonz4Profiling result for a compiled module of benchmark :z+Chrome trace for the profile is written to )Zgroup_by_input_shaperi   rM   )Zsort_byZ	row_limit)r   Zprofilerprofiletempfile
gettempdirZexport_chrome_tracerZ   Zkey_averagestabler   Z
use_device)rd   r   r   r7   r   ppathrc   r   r   r   perf_profile)  s   
r   c                 C   s  dd l }dd l}dd l}||}|j|}|j|j|d }t	 }t
j
 d}	|j|d|	 d}
d| d| d}dd	d
dddddddddddd|
dd|g}z|j|dd td|  d td|
  W d S  |jy } ztd|  W Y d }~d S d }~ww )Nr   z%Y%m%d_%H%M%SZncu_output_z.ncu-repz import sys; sys.path.insert(0, 'z	'); from zO import benchmark_compiled_module; benchmark_compiled_module(times=1, repeat=1)ncuz--target-processesallz--replay-modeZkernelz--kernel-name-basefunctionz--print-unitsbasez--setfullz--import-sourceyesz--force-overwritez--exportpython-cT)checkz%
NCU profiling results for benchmark r   zNCU report has been written to z!NCU profiling failed with error: )inspectos
subprocessgetfiler   dirnamesplitextbasenamer   r   datetimenowstrftimejoinrunrZ   CalledProcessError)r7   r   r   r   r   Zmodule_fileZ
module_dirmodule_nameZncu_dir	timestampZ
ncu_outputZ
python_cmdZncu_cmder   r   r   ncu_analyzer>  sT   
r   c                 C   sh   t j sJ t jjjdd | ddd t  d}t jj| t jjjd d td|  d S )	Ni )Zmax_entriesrM   r	   r   z/memory_snapshot.pickle)enabledz0The collect memory snapshot has been written to )	r   cudais_availableZmemoryZ_record_memory_historyr   r   Z_dump_snapshotrZ   )r   Zsnapshot_pathr   r   r   collect_memory_snapshotr  s   r   c           	      C   s   ddl }| }|jddddd |jdd	dd
d |jddddd |jdddd |jdddd | }|jrBt| |j dS d}d}tj	 rPtj
  |||dd }tj	 rmtj }td|d dd tj	 ry|jryt| |jrt|||| | |jrt| | dS dS )zM
    This is the function called in __main__ block of a compiled module.
    r   Nz--benchmark-kernelsz-k
store_truez,Whether to benchmark each individual kernels)actionhelpz--benchmark-all-configsr   z8Whether to benchmark each individual config for a kernelz	--profilez-pz&Whether to profile the compiled modulez--cuda-memory-snapshotz
            Whether to collect CUDA memory snapshot. Refer to
            "https://pytorch.org/blog/understanding-gpu-memory-1/
            for details about how to visualize the collected snapshot
        z--ncuzWhether to run ncu analysisrM   r   rh   zPeak GPU memory usage g    .Ar   z MB)argparseArgumentParseradd_argument
parse_argsZbenchmark_kernelsr]   r8   r   r   r   Zreset_peak_memory_statsZmax_memory_allocatedrZ   Zcuda_memory_snapshotr   r   r   r   r   )	r7   r   r   parserrN   r   r   rd   Zpeak_memr   r   r   compiled_module_main  sl   	



r   ))dataclassesr   r   collectionsr   typesr   typingr   r   r   r   Ztorch.autogradr   Ztorch.utils._ordered_setr   Zruntime.benchmarkingr
   Zruntime.runtime_utilsr   r   r   r%   rX   r(   r,   r6   dictr]   	dataclassr^   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s    

W	
~

4
