o
    Zh                      @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZ d dlmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d d	lm Z  d d
l!m"Z" d dl#m$Z$m%Z%m&Z&m'Z'm(Z( d dl)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/ erd dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 ddl8m9Z9 ddl:m;Z; ddl8m<Z< ddl=m>Z> ddl?m@Z@ dZAdaBe-eCdZDeEeCZFG dd dZGG dd dZHG d d! d!eIZJejKdDd$d%ZLejMG d&d' d'ZNejMG d(d) d)ZOeO ZPee"jQe"jRf ZSejMG d*d+ d+ZTejMG d,d- d-ZUG d.d/ d/eUZVG d0d1 d1ZWG d2d3 d3ZXG d4d5 d5eUZYG d6d7 d7eWeYZZG d8d9 d9eXeYZ[G d:d; d;eWeUZ\G d<d= d=eXeUZ]dEdBdCZ^dS )F    )annotationsN)IterableSequence)ThreadPoolExecutor)byrefc_size_tc_void_pCDLL)AnyCallableOptionalTYPE_CHECKINGUnion)multiprocessing)get_interface_for_device)rand_strided)ir)CppCodeCacheCUDACodeCache
DLLWrapperget_hashPyCodeCache)get_gpu_typeis_gpu)getArtifactLogger)
OrderedSet)BaseProcess)Queue)
ModuleType)TritonTemplateCaller   )WorkspaceArg)config)WorkspaceZeroMode)benchmarker)VCUDA_VISIBLE_DEVICESFZ
autotuningc                   @     e Zd ZdS )PingN__name__
__module____qualname__ r-   r-   O/var/www/auris/lib/python3.10/site-packages/torch/_inductor/autotune_process.pyr(   ;       r(   c                   @  r'   )PongNr)   r-   r-   r-   r.   r0   ?   r/   r0   c                   @  r'   )!NonzeroWorkspaceNotSupportedErrorNr)   r-   r-   r-   r.   r1   C   r/   r1   deviceOptional[int]c              	   c  sx    | du r
dV  dS t jt}t| t jt< zdV  W |du r&t jt= dS |t jt< dS |du r6t jt= w |t jt< w )z
    Context manager to set the CUDA_VISIBLE_DEVICES environment variable to the
    specified single device. If device is None, don't manipulate the environment.
    N)osenvirongetr&   str)r2   currentr-   r-   r.   set_cuda_visible_deviceG   s   r9   c                   @  s   e Zd ZU dZdZded< dZded< dZded< dZded	< e	d(ddZ
e	d(ddZd)ddZd*ddZd*ddZd+ddZ	d,d-dd Zd*d!d"Zd*d#d$Zd.d*d&d'ZdS )/TuningProcessz
    Abstraction for launching a helper process to benchmark kernels. Spawns
    the parent process and uses multiprocessing queues to send benchmark
    requests and return results.
    Nr3   r2   zOptional[BaseProcess]processzOptional[Queue[Any]]request_queueresponse_queue
Queue[Any]returnNonec                 C  sF   t dtjt z	t| | W dS  ty"   t 	d Y dS w )z4
        Entry point for the child process.
        z2Entering TuningProcess child. Visible devices = %szException in TuningProcessN)
autotuning_logdebugr4   r5   r6   r&   r:   workloop	Exception	exception)r<   r=   r-   r-   r.   process_maini   s   
zTuningProcess.process_mainc                 C  s\   	 |   }|du rdS t|tr|t  nt|tr$||  n	tdt| q)z<
        Work loop for the benchmarking subprocess.
        TNzInvalid request type )	r6   
isinstancer(   putr0   BenchmarkRequest	benchmarkRuntimeErrortype)r<   r=   objr-   r-   r.   rC   z   s   

zTuningProcess.workloopboolc                 C  s   | j duo| jduo| jduS )z?
        True if the sub-process has been initialized.
        Nr;   r<   r=   selfr-   r-   r.   valid   s
   
zTuningProcess.validc                 C  s   d | _  | _| _dS )z2
        Reset to an uninitialized state.
        NrO   rP   r-   r-   r.   clear   s   zTuningProcess.clearc                 C  s   |   rdS td}| | _| | _|j| j| j| jfd| _| jdus)J t	| j
 | j  W d   dS 1 s?w   Y  dS )z
        Create child process, request/response queues, and do the warm up.
        Set the environment to make only the provided GPU device visible
        to the process.
        NZspawn)targetargs)rR   r   Zget_contextr   r<   r=   ProcessrF   r;   r9   r2   start)rQ   ctxr-   r-   r.   
initialize   s   


"zTuningProcess.initializerM   r
   c                 C  s&   |    | jdusJ | j| dS )z8
        Push a work item to the child process.
        N)rY   r<   rH   )rQ   rM   r-   r-   r.   rH      s   zTuningProcess.put      ^@      @      ?c                 C  s   | j dusJ | jdusJ 	 z@|}d}|durB|dkrB|d8 }z	| jjdd}W n tjy9   | j  s7 Y nw |durB|dks|du rM| jj|d}|W S  tjyl   | j j}|du rg| j||d  |    w )a,  
        Get a response from the child process. Raises queue.Empty on timeout
        or if the process dies.

        This method is (so far) only used by TuningProcessPool, where torch._inductor.config entries are being used
        to populate the timeouts:

        Arguments:

            @param result_timeout: Timeout in seconds, defaults to 120.0 or to
                                   config.max_autotune_subproc_result_timeout_seconds when called by TuningProcessPool
            @param graceful_timeout: Timeout in seconds to allow graceful shutdown (SIGTERM is sent after this time).
                                    Defaults to 3.0 or to config.max_autotune_subproc_graceful_timeout_seconds
            @param terminate_timeout: Timeout in seconds after SIGTERM, until we send SIGKILL if the process
                                      remains alive. Defaults to 1.0 or to
                                      config.max_autotune_subproc_terminate_timeout_seconds.
        Returns:
            A response from the child process (Any type)
        NTr\   g      ?timeout)graceful_timeoutterminate_timeout)	r;   r=   r6   queueEmptyis_aliveexitcodekillrS   )rQ   result_timeoutr_   r`   Zremaining_timeoutresstatusr-   r-   r.   r6      s>   
zTuningProcess.getc                 C  s8   |   r| jdusJ | jdusJ | jd dS dS )z8
        Signal the child process to terminate.
        N)rR   r;   r<   rH   rP   r-   r-   r.   	terminate   s
   zTuningProcess.terminatec                 C  s$   | j dur| j   |   dS dS )z5
        Wait for the child process to exit.
        N)r;   joinrS   rP   r-   r-   r.   wait   s   

zTuningProcess.wait      @c                 C  s   | j d urA|   | j j|d | j  r;td| j j | j   | j j|d | j  r;td| j j | j   | 	  d S d S )Nr]   z&Sending SIGTERM to process with PID %dz&Sending SIGKILL to process with PID %d)
r;   ri   rj   rc   rA   warningpiderrorre   rS   )rQ   r_   r`   r-   r-   r.   re     s$   




zTuningProcess.kill)r<   r>   r=   r>   r?   r@   )r?   rN   r?   r@   )rM   r
   r?   r@   )rZ   r[   r\   )r?   r
   )rl   r\   )r*   r+   r,   __doc__r2   __annotations__r;   r<   r=   staticmethodrF   rC   rR   rS   rY   rH   r6   ri   rk   re   r-   r-   r-   r.   r:   \   s&   
 






3
	r:   c                   @  s\   e Zd ZU dZdZded< dZded< dd	d
ZdddZdddZ	dddZ
dddZdS )TuningProcessPoolz
    Maintains a pool of TuningProcesses to benchmark kernels in parallel
    across devices. By default, we create one TuningProcess per device and
    set the sub-process environment to make only that device visible.
    Nz$Optional[queue.Queue[TuningProcess]]	processeszOptional[ThreadPoolExecutor]executorr?   r@   c                 C  s   | j du | jdu ksJ | j durdS |  }td| t | _ |D ]}t|d}|  |	t
  | j 	| q$| j jD ]}t|jddtsMJ q@tt|d| _tsfdaddl}|| j dS dS )z,
        Start the child processes.
        Nz$Sub-process autotune device list: %s)r2   )rf   )max_workersTr   )ru   rv   get_device_listlogrB   ra   r   r:   rY   rH   r(   rG   r6   r0   r   lenEXIT_HANDLER_REGISTEREDatexitregisterri   )rQ   devicesr2   pr|   r-   r-   r.   rY   $  s&   


zTuningProcessPool.initializeSequence[Optional[int]]c                 C  sf   t jsdgS t }t|}| }ttjv r-dd tjt dD }t	||ks+J |S t
t|S )zD
        Gather the list of devices to be used in the pool.
        Nc                 S  s   g | ]}t |qS r-   )int).0dr-   r-   r.   
<listcomp>W      z5TuningProcessPool.get_device_list.<locals>.<listcomp>,)r"   Zautotune_multi_devicer   r   Zdevice_countr&   r4   r5   splitrz   listrange)rQ   Zgpu_typedevice_interfacecountr~   r-   r-   r.   rx   I  s   
z!TuningProcessPool.get_device_listc                 C  s^   | j dur| j   d| _ | jdur-| jjD ]}|  q| jjD ]}|  q!d| _dS dS )z:
        Signal all child processes to terminate.
        N)rv   shutdownru   ra   ri   rk   )rQ   r   r-   r-   r.   ri   ]  s   





zTuningProcessPool.terminatechoicer   floatc              	   C  s   |j dusJ | jdusJ | j }||j  z1z|tjtjtjW W | j| S  tj	yJ   t
d| d td Y W | j| S w | j| w )z
        Entry point for the thread-pool helper threads: Wait for an open TuningProcess,
        remove it from the queue, execute the benchmark in that subprocess, and return
        the TuningProcess to the queue.
        NzFailed to benchmark choice 'z['. It will be ignored. Please debug the root cause in case the choice can bring perf gains.inf)Zbmreqru   r6   rH   r"   Z+max_autotune_subproc_result_timeout_secondsZ-max_autotune_subproc_graceful_timeout_secondsZ.max_autotune_subproc_terminate_timeout_secondsra   rb   warningswarnr   )rQ   r   r;   r-   r-   r.   rT   l  s&   

zTuningProcessPool.targetchoiceslist[TritonTemplateCaller]!dict[TritonTemplateCaller, float]c                 C  sP   | j dus	J d| jdusJ i }t|| j| j|D ]\}}|||< q|S )z>
        Benchmark each choice in a separate process.
        Nz&Tuning process pool is not initialized)ru   rv   zipmaprT   )rQ   r   resultsr   resultr-   r-   r.   rJ     s   
zTuningProcessPool.benchmarkrp   )r?   r   )r   r   r?   r   r   r   r?   r   )r*   r+   r,   rq   ru   rr   rv   rY   rx   ri   rT   rJ   r-   r-   r-   r.   rt     s   
 

%

rt   c                   @  sZ   e Zd ZU ded< ded< ded< ded< d	ed
< dZded< edddZdddZdS )
TensorMetaztorch.devicer2   ztorch.dtypedtypeztorch._prims_common.ShapeTypesizesztorch._prims_common.StrideTypestridesr   offsetNzOptional[str]nameirnodes/Union[LayoutOrBuffer, Sequence[LayoutOrBuffer]]r?   #Union[TensorMeta, list[TensorMeta]]c              	     s   t |tr fdd|D }tdd |D sJ |S |}t |tjr*tjd|d}| }|d us4J | }|d us>J t||t	j
jj| tjdt	j
jj| tjdt	j
jj| jtjd| dS )	Nc                   s   g | ]}  |qS r-   )from_irnodesr   xclsr-   r.   r     s    z+TensorMeta.from_irnodes.<locals>.<listcomp>c                 s  s    | ]}t |tV  qd S N)rG   r   r   r-   r-   r.   	<genexpr>  s    z*TensorMeta.from_irnodes.<locals>.<genexpr>Zfake)r   Zlayout)fallback)r2   r   r   r   r   r   )rG   r   allr   LayoutBufferZ	get_dtypeZ
get_devicer   r%   graphZsizevarsZ
size_hintsget_sizer"   Zunbacked_symint_fallbackZ
get_strideZ	size_hintZ
get_layoutr   get_name)r   r   r   noder   r2   r-   r   r.   r     s8   
zTensorMeta.from_irnodestorch.Tensorc                 C  s   t | j| j| j| j| jdS )N)r2   r   
extra_size)r   r   r   r2   r   r   rP   r-   r-   r.   	to_tensor  s   zTensorMeta.to_tensor)r   r   r?   r   )r?   r   )r*   r+   r,   rr   r   classmethodr   r   r-   r-   r-   r.   r     s   
 #r   c                   @  sN   e Zd ZdZdddZdddZdddZddd ddZddd ddZdS )!rI   a1  
    Only handle triton template benchmark for now. The extern kernel benchmark
    can be done inside the same process since they usually don't cause crash.

    Important: Instances of this class and subclasses have to be serializable
    across process boundaries. Do not put CUDA Tensors in here!
    kernel_namer7   input_tensor_metar   output_tensor_meta
extra_argsIterable[Any]r?   r@   c                   sh   || _ t|tr|g}|| _t ttfr,t dkr(t fdd D s(J  d   | _|| _	d S )Nr    c                 3  s2    | ]}d D ]}t  d |t ||kV  qqdS ))r2   r   r   r   r   r   N)getattr)r   r   attrr   r-   r.   r     s    z,BenchmarkRequest.__init__.<locals>.<genexpr>r   )
r   rG   r   r   tupler   rz   r   r   r   )rQ   r   r   r   r   r-   r   r.   __init__  s   

zBenchmarkRequest.__init__input_tensorsr   output_tensorCallable[[], None]c                G     t r   NotImplementedErrorrQ   r   r   r-   r-   r.   make_run_fn  s   zBenchmarkRequest.make_run_fnc                 C  s   d S r   r-   rP   r-   r-   r.   cleanup_run_fn  s   zBenchmarkRequest.cleanup_run_fnNr   Optional[torch.Tensor]r   c                G  r   r   r   rQ   fnr   r   r-   r-   r.   do_bench  s   zBenchmarkRequest.do_benchc          
      G  s   t tj}|rt }|d u r't|dksJ tdd | jD }| j	 }|r3t | }t }z
| j
|d|i}W n tyO   t d td Y S w |r\t | }t }| j|g||R  }|rzt | }	t dt| |||	 |   |S )Nr   c                 s  s    | ]}|  V  qd S r   )r   r   r-   r-   r.   r     s    z-BenchmarkRequest.benchmark.<locals>.<genexpr>r   z0Skipping op due to nonzero workspace requirementr   z6InChildProcess %s: load %f, create tensor %f, bench %f)rA   isEnabledForloggingDEBUGtimerz   r   r   r   r   r   r1   infor   r   rB   r7   r   )
rQ   r   r   rB   Zstart_tsZcreate_tensor_elapser   Zload_elapseoutZbench_elapser-   r-   r.   rJ     s>   

zBenchmarkRequest.benchmark)
r   r7   r   r   r   r   r   r   r?   r@   r   r   r   r   r?   r   rp   r   r   r   r   r?   r   )	r*   r+   r,   rq   r   r   r   r   rJ   r-   r-   r-   r.   rI     s    


rI   c                   @  s,   e Zd ZdZddddZdd	dddZdS )TestBenchmarkRequestz
    Supports unit testing. Defined in this file so that the TuningProcess
    sub-process knows how to unpickle these objects.
    NvalueOptional[float]r?   r@   c                 C  s
   || _ d S r   )r   )rQ   r   r-   r-   r.   r   ?  s   
zTestBenchmarkRequest.__init__r   r   r   r   r   r   c                G  s   | j d u r	td| j S )NzFailed to run)r   rD   r   r-   r-   r.   rJ   B  s   
zTestBenchmarkRequest.benchmarkr   )r   r   r?   r@   r   )r*   r+   r,   rq   r   rJ   r-   r-   r-   r.   r   9  s
    r   c                   @     e Zd Zdddd	d
ZdS )GPUDeviceBenchmarkMixinNr   r   r   r   r   r?   r   c          	      G  s   t dd g ||D }t|dksJ d| tdd |D d}t|}t|dkr5tt|}n| }|| t|}|	  W d    |S 1 sSw   Y  |S )Nc                 s  s<    | ]}t |tjrt|jjr|jjd ur|jjV  qd S r   )rG   torchZTensorr   r2   rL   indexr   Ztensorr-   r-   r.   r   Q  s    

z3GPUDeviceBenchmarkMixin.do_bench.<locals>.<genexpr>r    zCan not mix devices c                 s  s$    | ]}t |jjr|jjV  qd S r   )r   r2   rL   r   r-   r-   r.   r   Z  s    

cuda)
r   rz   nextr   iterZcurrent_devicer2   r$   Zbenchmark_gpusynchronize)	rQ   r   r   r   Zdevice_idx_setdevice_typer   Z
device_idxr   r-   r-   r.   r   K  s*   



z GPUDeviceBenchmarkMixin.do_benchr   r*   r+   r,   r   r-   r-   r-   r.   r   J      r   c                   @  r   )CPUDeviceBenchmarkMixinNr   r   r   r   r   r?   r   c                G  s
   t |S r   )r$   Zbenchmark_cpur   r-   r-   r.   r   n     
z CPUDeviceBenchmarkMixin.do_benchr   r   r-   r-   r-   r.   r   m  r   r   c                      sD   e Zd Z				d"d# fddZd$ddZdd Zd%d d!Z  ZS )&TritonBenchmarkRequestr   Nr   r7   r   r   r   r   r   module_pathmodule_cache_key
num_stagesr   	num_warpsmatrix_instr_nonkdimwaves_per_eukpackworkspace_argOptional[WorkspaceArg]r?   r@   c                   sF   t  |||| || _|| _|| _|| _|	| _|
| _|| _|| _	d S r   )
superr   r   r   r   r   r   r   r   r   )rQ   r   r   r   r   r   r   r   r   r   r   r   r   	__class__r-   r.   r   z  s   
zTritonBenchmarkRequest.__init__r   r   r   r   c                  s0  t | j| j}td| j| j t|| jjt	| j
 dj_i dd l}d|jv r3dd< jjdkr<dnjj}t|}|| jjj| jd urb| j fdd}|S tt|| jtjjjjrtjg R i diS tjg R i d	d
S )Nz"benchmark module key: %s, path: %sFr   Zwarmupcpuc                    s`   j } tj| fdtjjd}jtjkr|  g | R i dd d S )N)r    r   r2   TstreamZbenchmark_run)	r   r   Zempty_stridedZuint8r2   Z	zero_moder#   ZUNINITIALIZEDZzero_)workspace_sizeZworkspace_tensorr   r   r   
run_methodr   Z
warmup_argr   r-   r.   run_with_workspace  s.   z>TritonBenchmarkRequest.make_run_fn.<locals>.run_with_workspacer   Tr   )r   load_by_key_pathr   r   rA   rB   r   r   runr   r   __self__Zwith_bandwidth_infoinspect	signature
parametersr2   rL   r   Zget_raw_streamr   r   r   rG   r   Z	_inductorZruntimeZtriton_heuristicsZDebugAutotuner	functoolspartial)rQ   r   r   modr   r   r   r   r-   r   r.   r     sn   



	
z"TritonBenchmarkRequest.make_run_fnc                 C  s$   t | j| j}t|| j  d S r   )r   r   r   r   r   r   
precompile)rQ   r  r-   r-   r.   r    s   z!TritonBenchmarkRequest.precompilec                 C     d| j d| jd| jS )Nself.kernel_name=z, self.module_path=z, self.module_cache_key=)r   r   r   rP   r-   r-   r.   __str__     zTritonBenchmarkRequest.__str__)r   r   r   N)r   r7   r   r   r   r   r   r   r   r7   r   r7   r   r   r   r   r   r   r   r   r   r   r   r   r?   r@   r   r?   r7   )r*   r+   r,   r   r   r  r  __classcell__r-   r-   r   r.   r   w  s    
Tr   c                   @  r'   )TritonGPUBenchmarkRequestNr)   r-   r-   r-   r.   r    r/   r  c                   @  r'   )TritonCPUBenchmarkRequestNr)   r-   r-   r-   r.   r    r/   r  c                      sV   e Zd Zd fddZdd ZdddZdddZdd ZdddZd ddZ	  Z
S )!CUDABenchmarkRequestr   r7   r   r   r   r   r   source_coder?   r@   c                   sV   t  |||| || _d| _d | _d | _d| _d| _d| _t	
| jd\| _| _d S )Nr   F so)r   r   r  r   	workspaceDLL_workspace_size_updatedhash_keysource_filer   writerQ   r   r   r   r   r  r   r-   r.   r     s   zCUDABenchmarkRequest.__init__c                 C  s*   t d|  t| jd t d|  d S )NPrecompiling %sr  Done precompiling %s)rA   rB   r   compiler  rP   r-   r-   r.   r    s   zCUDABenchmarkRequest.precompiler   r   r   r   c             	   G  s   |    |   dd t||g D }td| j| j| j| j|| j	 t
tj j}t| j| j}t
d}| jdkrStj| jd d tj|jd| _t
| j }tj|g|| j	d ||R  S )Nc                 S  s   g | ]}t | qS r-   )r   data_ptrr   r-   r-   r.   r     s    
z4CUDABenchmarkRequest.make_run_fn.<locals>.<listcomp>zqmake_run_fn: self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sr         r   )ensure_dll_loadedupdate_workspace_sizer   rA   rB   r   r  r  r  r   r   r   r   current_streamcuda_streamr   r   ZzerosZfloat64r2   r  r  r  r  )rQ   r   r   rU   
stream_ptrr   Zworkspace_ptrr-   r-   r.   r     sJ   	
z CUDABenchmarkRequest.make_run_fnc              
   C  s   | j rd S |   tdd | jD }dd t|d D }ttj j	}t
| j| j}t }|g || jt|d |R   tj  |j| _td| j| j| j| j| j|| j d| _ d S )Nc                 S  s   h | ]}|j qS r-   )r   )r   metar-   r-   r.   	<setcomp>@  s    z=CUDABenchmarkRequest.update_workspace_size.<locals>.<setcomp>c                 S  s   g | ]}t d qS r   )r   )r   _r-   r-   r.   r   B  r   z>CUDABenchmarkRequest.update_workspace_size.<locals>.<listcomp>r    zupdate_workspace_size called: new workspace size=%d, self.kernel_name=%s, self.source_file=%s, self.hash_key=%s, self.DLL=%s, args=%s, self.extra_args=%sT)r  r  rz   r   r   r   r   r   r!  r"  r   r  r   r   r   r   r   r   r   rA   rB   r  r  )rQ   Zunique_input_countrU   r#  r   Zc_workspace_sizer-   r-   r.   r   ;  sH   
	

z*CUDABenchmarkRequest.update_workspace_sizec                 C  s,   | j d u rt| jd\| _ | _| _d S d S )Nr  )r  r   loadr  r  r  rP   r-   r-   r.   r  _  s
   
z&CUDABenchmarkRequest.ensure_dll_loadedc                 C  s   | j d ur
| j   d | _d S r   )r  closer  rP   r-   r-   r.   r   e  s   


z#CUDABenchmarkRequest.cleanup_run_fnc                 C  r  )Nr  z, self.source_file=z, self.hash_key=)r   r  r  rP   r-   r-   r.   r  j  r	  zCUDABenchmarkRequest.__str__r   r7   r   r   r   r   r   r   r  r7   r?   r@   r   rp   r
  )r*   r+   r,   r   r  r   r   r  r   r  r  r-   r-   r   r.   r    s    

'$
r  c                      sD   e Zd Zd fddZdd ZdddZdddZdddZ  ZS )CppBenchmarkRequestr   r7   r   r   r   r   r   r  r?   r@   c                   s,   t  |||| || _t|| _d | _d S r   )r   r   r  r   r  r  r  r   r-   r.   r   r  s   

zCppBenchmarkRequest.__init__c                 C  s,   t d|  tj| jdd t d|  d S )Nr  r   r   r  )rA   rB   r   r'  r  rP   r-   r-   r.   r    s   zCppBenchmarkRequest.precompiler   r   r   r   c                G  s   t j| jdd| _dd t||g D }td| j| j|| j t	| j| j}t
dd | jD s4J tjgt|tt| j  |_tj|g|| jR  S )Nr   r+  c                 S  s   g | ]}|  qS r-   )r  r   r-   r-   r.   r     r   z3CppBenchmarkRequest.make_run_fn.<locals>.<listcomp>zJmake_run_fn: self.kernel_name=%s, self.DLL=%s, args=%s, self.extra_args=%sc                 s  s    | ]	}t |tjV  qd S r   )rG   ctypesc_ulonglong)r   argr-   r-   r.   r     s    z2CppBenchmarkRequest.make_run_fn.<locals>.<genexpr>)r   r'  r  r  r   rA   rB   r   r   r   r   r,  r-  rz   argtypesr  r  )rQ   r   r   rU   r   r-   r-   r.   r     s*   zCppBenchmarkRequest.make_run_fnc                 C  s.   | j d ur	 t| j dr| j   d S d S d S )Nr(  )r  hasattrr(  rP   r-   r-   r.   r     s   
z"CppBenchmarkRequest.cleanup_run_fnc                 C  s   d| j S )Nr  )r   rP   r-   r-   r.   r    s   zCppBenchmarkRequest.__str__r)  r   rp   r
  )	r*   r+   r,   r   r  r   r   r  r  r-   r-   r   r.   r*  n  s    

r*  r   r   r?   r   c                 C  s
   t | S )zO
    Do benchmarking in a subprocess and return the perf number (latency).
    )tuning_poolrJ   )r   r-   r-   r.   benchmark_in_sub_process  r   r2  )r2   r3   r   )_
__future__r   
contextlibr,  dataclassesr  r   r4   ra   r   r   collections.abcr   r   concurrent.futuresr   r   r   r   r	   typingr
   r   r   r   r   r   Ztorch._inductor.async_compiler   Ztorch._dynamo.device_interfacer   Ztorch._dynamo.testingr   Ztorch._inductorr   Ztorch._inductor.codecacher   r   r   r   r   Ztorch._inductor.utilsr   r   Ztorch._loggingr   Ztorch.utils._ordered_setr   Zmultiprocessing.processr   Zmultiprocessing.queuesr   typesr   Z torch._inductor.select_algorithmr   Zcodegen.commonr!   r  r"   r#   Zruntime.benchmarkingr$   Zvirtualizedr%   r&   r{   r*   rA   	getLoggerry   r(   r0   rD   r1   contextmanagerr9   	dataclassr:   rt   r1  r   r   ZLayoutOrBufferr   rI   r   r   r   r   r  r  r  r*  r2  r-   r-   r-   r.   <module>   s   

 = 6`#
xw?