o
    Zh+                     @   s,  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZmZ d dlZd dlmZmZ d dlmZ ejed	ZeoOej Zd
ZedZedZdeee
ef ef deee
ef ef fddZ G dd dZ!G dd de!Z"G dd de"Z#ere# Z$dS e" Z$dS )    N)cached_propertywraps)chain)median)AnyCallable)Concatenate	ParamSpecSelfTypeVar)countersdynamo_timed)use_experimental_benchmarkerZbenchmarkingi  PTfnreturnc              	      s.   t  dtdtjdtjdtf fdd}|S )zWraps `fn` with `dynamo_timed` context, and increments the appropriate dynamo
    counters. It is expected that `fn` is a method of `Benchmarker` or one of its
    subclasses; typing limitations prevent us from declaring this directly.
    selfargskwargsr   c                    st   | j j d j }td d|   d7  < t|dd  | g|R i |W  d    S 1 s3w   Y  d S )N.Zinductorzbenchmarking.   T)Zlog_pt2_compile_event)	__class____name__r   r   )r   r   r   Zfn_qual_namer    S/var/www/auris/lib/python3.10/site-packages/torch/_inductor/runtime/benchmarking.pywrapper"   s
   $ztime_and_count.<locals>.wrapper)r   r   r   r   r   r   )r   r   r   r   r   time_and_count   s   $r   c                   @   s   e Zd ZdeddfddZedededef deedf d	e	e
ef d
edefddZe	ddedeg ef dededef
ddZededed
edefddZdS )Benchmarkerr   r   Nc                 C   s   d S Nr   )r   r   r   r   __init__-   s   zBenchmarker.__init__r   .fn_args	fn_kwargsr   c                    s   d}t  D ]}t|tjsq	|du r|j}q	|j|kr#tdq	|du r,td fdd}|tdkrD| j|fi |S | j|fi |S )a  Benchmark `fn(*fn_args, *fn_kwargs)` and return the runtime, in milliseconds (the
        actual runtime calculation is dictated by the benchmarking implementation, but may be
        one of [mean, median, minimum, etc.]). Functions as a convenience wrapper around
        device-specific implementations, like `benchmark_cpu` and `benchmark_gpu`. Raises
        `ValueError(...)` if we can't safely infer the device type of `fn`; for example,
        if multiple device types are found in `fn_args` and `fn_kwargs`, or if no device
        types are found.

        Arguments:
        - fn: The function to benchmark.
        - fn_args: The function's arguments.
        - fn_kwargs: The function's kwargs.

        Keyword Arguments:
        - **kwargs: The benchmarking implementation's kwargs.

        Returns:
        - The runtime of `fn(*fn_args, **fn_kwargs)`, in milliseconds.
        NzcCan't safely infer the device type of `fn` with multiple device types in `fn_args` and `fn_kwargs`!zCan't safely infer the device type of `fn` with no device types in `fn_args` or `fn_kwargs`! You should be calling `.benchmark_cpu` or `.benchmark_gpu` directly.c                      s    i S r    r   r   r   r"   r#   r   r   <lambda>Y   s    z'Benchmarker.benchmark.<locals>.<lambda>cpu)	r   values
isinstancetorchZTensordevice
ValueErrorbenchmark_cpubenchmark_gpu)r   r   r"   r#   r   Zinferred_deviceZarg_or_kwarg	_callabler   r$   r   	benchmark0   s&   
zBenchmarker.benchmark   d   r.   warmuprepc                    s.   dt dtt f fdd}|| t||S )a  Benchmark the CPU callable, `_callable`, and return the median runtime,
        in milliseconds.

        Arguments:
        - _callable: The CPU callable to benchmark.

        Keyword Arguments:
        - warmup: Optionally, the duration, in milliseconds, to run `_callable`
        before benchmarking starts.
        - rep: Optionally, the duration, in milliseconds, to run `_callable`
        during benchmarking.

        Returns:
        - The median runtime of `_callable`, in milliseconds.
        msr   c                    sN   g }t  }	 t  }   t  }||| t  || t | kr&	 |S qr    )timeperf_counterappendMILLISECONDS_PER_SECOND)r4   ZtimingsZrun_start_tZstart_tZend_tr.   r   r   run_foru   s   z*Benchmarker.benchmark_cpu.<locals>.run_for)intlistfloatr   )r   r.   r2   r3   r:   r   r9   r   r,   a   s   zBenchmarker.benchmark_cpur   c                 O   s   t r    )NotImplementedError)r   r   r   r   r   r   r-      s   zBenchmarker.benchmark_gpu)r0   r1   )r   
__module____qualname__r
   r!   r   r   r   tupledictstrr=   r/   r;   r,   r-   r   r   r   r   r   ,   s>    


0
" r   c                	   @   sP   e Zd Zedededef fddZededeg ef dede	fdd	Z
d
S )TritonBenchmarkerr   r   .c              
   C   s6   z	ddl m} W |S  ty } ztd|d}~ww )z"Lazily import Triton's `do_bench`.r   )do_benchzrequires TritonN)Ztriton.testingrE   ImportErrorr>   )r   rE   er   r   r   triton_do_bench   s   
z!TritonBenchmarker.triton_do_benchr.   r   c                 K   s   t | jj}t| D ]	}||vr||= qd|v r&| j|fi |d S d|v r3| j|fi |S | j|fi |ddiS )a  Benchmark the GPU callable, `_callable`, and return the runtime, in milliseconds.

        Arguments:
        - _callable: The GPU callable to benchmark.

        Keyword Arguments:
        - quantiles: Optionally, a tuple of floats denoting the requested quantiles.
        - return_mode: Optionally, the requested return mode. Currently, Triton's
        `do_bench` supports min, max, mean, and median return modes.
        - **kwargs: Additional kwargs passed to Triton's `do_bench`.

        Returns:
        - The runtime of `callable`, in milliseconds. If `kwargs["quantiles"]` is specified,
        this is the first requested quantile. Else, if `kwargs["return_mode"]` is specified,
        this is the requested return mode. Otherwise, this is the median.
        Z	quantilesr   Zreturn_moder   )inspect	signaturerH   
parametersr<   keys)r   r.   r   Zdo_bench_paramskwargr   r   r   r-      s   zTritonBenchmarker.benchmark_gpuN)r   r?   r@   r   r
   r   r   rH   r   r=   r-   r   r   r   r   rD      s
    (rD   c                   @   s   e Zd ZededefddZdededeee	j
je	j
jf  fddZdedeee	j
je	j
jf  defd	d
Ze				ddedeg ef dedededededefddZdS )InductorBenchmarkerr   r   c                 C   s   t j }t j|}|jS )z7Get the L2 cache size, in bytes, of the current device.)r)   cudaZcurrent_deviceZget_device_propertiesL2_cache_size)r   r*   propsr   r   r   rP      s   
z!InductorBenchmarker.L2_cache_sizeitersc                 C   s   dd t |D S )z!Get `iters` pairs of CUDA events.c                 S   s(   g | ]}t jjd dt jjd dfqS )T)Zenable_timing)r)   rO   Event).0_r   r   r   
<listcomp>   s    z7InductorBenchmarker.get_event_pairs.<locals>.<listcomp>)range)r   rR   r   r   r   get_event_pairs   s   z#InductorBenchmarker.get_event_pairsevent_pairsc                 C   s   t dd |D S )zIGet the minimum timing, in milliseconds, for a group of CUDA event pairs.c                 S   s   g | ]	\}}| |qS r   )Zelapsed_time)rT   start_event	end_eventr   r   r   rV      s    zBInductorBenchmarker.get_event_pairs_min_timing.<locals>.<listcomp>)min)r   rY   r   r   r   get_event_pairs_min_timing   s
   z.InductorBenchmarker.get_event_pairs_min_timing   r1      r.   estimation_itersmemory_warmup_itersbenchmark_itersmax_benchmark_durationr   c                 K   s  t j  |  t j  t j| jd t jdd}|  | |}|D ]\}	}
|  |	  |  |
  q$t j  | 	|}t
t|t|| d}t|D ]}|  qR| |}|D ]\}	}
|  |	  |  |
  q`t j  | 	|}~t||S )a<  Benchmark a GPU callable using a custom benchmarking implementation.

        Arguments:
        - _callable: The callable to benchmark.

        Keyword Arguments:
        - estimation_iters: Optionally, the number of iterations to run `_callable`
        during runtime estimation.
        - memory_warmup_iters: Optionally, the number of iterations to flush the L2
        cache before starting benchmarking.
        - benchmark_iters: Optionally, the number of iterations to run `_callable`
        during the benchmarking.
        - max_benchmark_duration: Optionally, the maximum duration of the benchmarking,
        in milliseconds. An estimated duration is calculated based on the values
        of `memory_warmup_iters` and `benchmark_iters`, along with the estimated
        runtime of `_callable` and various other factors, and we then shrink
        `benchmark_iters` to fit in the alloted maximum duration.
        - **kwargs: Additional kwargs that may be passed to the fallback.

        Returns:
        - The minimum runtime of `_callable`, in milliseconds.
           rO   )Zdtyper*   r   )r)   rO   ZsynchronizeemptyrP   r;   Zzero_rX   recordr]   maxr\   rW   )r   r.   r`   ra   rb   rc   r   bufferrY   rZ   r[   Zestimated_timingrU   Zbenchmarked_timingr   r   r   r-      s8   
!










z!InductorBenchmarker.benchmark_gpuN)r^   r1   r1   r_   )r   r?   r@   r   r
   r;   rP   r<   rA   r)   rO   rS   rX   r=   r]   r   r   r   r-   r   r   r   r   rN      sN    


rN   )%rI   r5   	functoolsr   r   	itertoolsr   
statisticsr   typingr   r   Ztyping_extensionsr   r	   r
   r   r)   Ztorch._dynamo.utilsr   r   Ztorch._inductor.configr   Z_loggingZgetArtifactLoggerr   loggerrO   Zis_availabler8   r   r   r   r   rD   rN   Zbenchmarkerr   r   r   r   <module>   s8    
]'q