
    $ThPN                        S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJ	r	J
r
Jr  SSKJr  SSKJr  S rS rS	 rSS
 jrSS jrSS jr " S S5      r " S S5      rS rSS jrSS jrS r\SS j5       rSS jrg)    N)contextmanager)AnyDictList   )language)runtimec                    SR                  U 5      n SSSSU -   S/n[        R                  " U5      nUR                  [        R
                  R                  5      R                  S5      nU Vs/ s H  n[        U5      PM     nnU$ s  snf )N,
nvidia-smi-i0z--query-gpu=z--format=csv,noheader,nounits)	join
subprocesscheck_outputdecodesysstdoutencodingsplitint)attrscmdoutretxs        F/var/www/auris/envauris/lib/python3.13/site-packages/triton/testing.pynvsmir      sz    HHUOEsNU$:<[
\C

!
!#
&C
**SZZ((
)
/
/
4C
3a3q63C
J  s   .Bc                 z   ^ ^ [        T 5      m[        T 5      m U U4S jnU Vs/ s H
  o" U5      PM     sn$ s  snf )Nc                    > SU s=::  a  S::  d  O  [        S5      eU TS-
  -  n[        R                  " U5      n[        R                  " U5      nX-
  nSU-
  TU   -  UTU   -  -   $ )Nr   r   z%Quantiles must be in the range [0, 1])
ValueErrormathfloorceil)qpointloweruppertans        r   get_quantile_quantile.<locals>.get_quantile   sj    Q!DEEQU

5!		% MA5!A%L00    )lensorted)r*   r%   r,   r+   s   `  @r   	_quantiler1      s8    AAq	A1 &''QLOQ'''s   8c                    Ub!  [        X5      n[        U5      S:X  a  US   nU$ US:X  a  U $ US:X  a  [        U 5      $ US:X  a  [        U 5      $ US:X  a  [        R
                  " U 5      $ US:X  a  [        R                  " U 5      $ g )Nr   r   allminmaxmeanmedian)r1   r/   r4   r5   
statisticsr6   r7   )times	quantilesreturn_moder   s       r   _summarize_statisticsr<   *   s    )s8q=a&C
e		5z		5z		u%%		   '' 
!r.   c                     SSK nUS;   d   eUR                  R                  UR                  R                  5       5         U " 5         Ub1  U H+  nUR	                  5         UR                  S5        SUl        M-     UR                  R                  SS9nUR                  R                  SS9nUR                  5         [        S5       H
  n	U " 5         M     UR                  5         UR                  R                  5         UR                  U5      S-  n
[        S[        X-  5      5      nUR                  R                  5       nUR                  R                  U5         [        U5       H  n	Ub  U H
  nSUl        M     U " 5         M     SSS5        UR                  R                  5         / nSn[        U5       H  n	UR                  R                  SS9nUR                  R                  SS9nUR                  5         UR!                  5         UR                  5         UR                  R                  5         XR                  U5      U-  /-  nM     [#        XU5      sSSS5        $ ! , (       d  f       N= f! , (       d  f       g= f)	a  
Benchmark the runtime of the provided function.

:param fn: Function to benchmark
:type fn: Callable
:param rep: Repetition time (in ms)
:type rep: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
:type return_mode: str
r   Nr4   r5   r6   r7   r3   Tenable_timing   r   
   )torchcudastreamStreamdetach_requires_grad_gradEventrecordrangesynchronizeelapsed_timer5   r   	CUDAGraphgraphreplayr<   )fnrepgrad_to_noner:   r;   rC   r   start_event	end_event_estimate_msn_repeatgr   	n_retriess                  r   do_bench_cudagraphr\   <   s    AAAA			5::,,.	/
#!		  & " jj&&T&:JJ$$4$8	qAD 

 !..y9A=q#c/01 JJ  "ZZa 8_+)!% *	 % ! 	

 	y!A*****>K

((t(<I HHJJJ""$,,Y7(BCCC " %S[AY 
0	/4 ! 5 
0	/s&    D!I?!-I.CI?.
I<	8I??
Jc                    US;   d   e[         R                  R                  R                  5       nU " 5         UR	                  5         [         R                  R                  R                  5       nUR                  SS9nUR                  SS9n	UR                  5         [        S5       H3  n
[         R                  R                  R                  U5        U " 5         M5     U	R                  5         UR	                  5         UR                  U	5      S-  n[        S[        X-  5      5      n[        S[        X+-  5      5      n[        U5       Vs/ s H  oR                  SS9PM     nn[        U5       Vs/ s H  oR                  SS9PM     n	n[        U5       H
  n
U " 5         M     [        U5       Hj  nUb  U H
  nSUl        M     [         R                  R                  R                  U5        X   R                  5         U " 5         X   R                  5         Ml     UR	                  5         [        X5       VVs/ s H  u  nnUR                  U5      PM     nnn[        UXE5      $ s  snf s  snf s  snnf )a  
Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
the 20-th and 80-th performance percentile.

:param fn: Function to benchmark
:type fn: Callable
:param warmup: Warmup time (in ms)
:type warmup: int
:param rep: Repetition time (in ms)
:type rep: int
:param grad_to_none: Reset the gradient of the provided tensor to None
:type grad_to_none: torch.tensor, optional
:param quantiles: Performance percentile to return in addition to the median.
:type quantiles: list[float], optional
:param return_mode: The statistical measure to return. Options are "min", "max", "mean", "median", or "all". Default is "mean".
:type return_mode: str
r>   Tr?   rA   r   N)r	   driveractiveget_device_interfacerM   get_empty_cache_for_benchmarkrJ   rK   rL   clear_cacherN   r5   r   rI   zipr<   )rR   warmuprS   rT   r:   r;   dicacherU   rV   rW   rX   n_warmuprY   ir   ser9   s                      r   do_benchrk   {   s   $ AAAA				3	3	5BDNNNN!!??AE (((.Kt,I1X))%0
  NN**959K 1c&./0H1c#+,-H9>xIA88$8/KI7<XG!-IG8_
  8_ #! " 	))%0
  NN+.{+FG+F41aQ^^A+FEG 	??- JG( Hs   I-6I2 I7c                    SSK nSSKn[        XR                  5      (       d  UR	                  U 5      n [        XR                  5      (       d  UR	                  U5      nUc  Sn[        U5      (       a  U" U R                  5      OUnUc  Sn[        U5      (       a  U" U R                  5      OUn[        XR                  5      (       aV  U R                  UR                  :X  a  U R                  5       n U R                  5       R                  5       R                  5       n [        XR                  5      (       aV  UR                  UR                  :X  a  UR                  5       nUR                  5       R                  5       R                  5       nU R                  S:  d  UR                  S:  a  UR                  R                  XX#SS9  gUR                  XX#S9(       d  [        U S	U  S
U SU SU S3
5      eg)a  
Asserts that two inputs are close within a certain tolerance.

:param x: The first input.
:type x: scala, list, numpy.ndarray, or torch.Tensor
:param y: The second input.
:type y: scala, list, numpy.ndarray, or torch.Tensor
:param atol: The absolute tolerance. Default value is 1e-2.
:type atol: float, optional
:param rtol: The relative tolerance. Default value is 0.
:type rtol: float, optional
:param err_msg: The error message to use if the assertion fails.
:type err_msg: str
r   Ng{Gz?g        r   T)atolrtol	equal_nan)rm   rn    z is not close to z (atol=z, rtol=))numpyrC   
isinstanceTensortensorcallabledtypebfloat16floatcpudetachsizetestingassert_allcloseallcloseAssertionError)r   yrm   rn   err_msgnprC   s          r   assert_closer      s     a&&LLOa&&LLO|$TNN4=D|$TNN4=D !\\""77enn$	AEEGNN""$!\\""77enn$	AEEGNN""$ 	vvzQVVaZ


""1d"N;;q$;2y!,=aSvWUYTZZ[\]] 3r.   c                   ~    \ rS rSrSr     SS\\   S\\   S\S\\   S\\   S	\S
\\\4   S\S\S\	S\	4S jjr
Srg)	Benchmark   zc
This class is used by the :code:`perf_report` function to generate line plots with a concise API.
Nx_namesx_valsline_arg	line_vals
line_names	plot_nameargsxlabelylabelx_logy_logc                     Xl         X l        Xl        X0l        X@l        XPl        Xl        Xl        Xl        Xl	        X`l
        Xpl        g)a  
Constructor.
x_vals can be a list of scalars or a list of tuples/lists. If x_vals is a list
of scalars and there are multiple x_names, all arguments will have the same value.
If x_vals is a list of tuples/lists, each element should have the same length as
x_names.

:param x_names: Name of the arguments that should appear on the x axis of the plot.
:type x_names: List[str]
:param x_vals: List of values to use for the arguments in :code:`x_names`.
:type x_vals: List[Any]
:param line_arg: Argument name for which different values correspond to different lines in the plot.
:type line_arg: str
:param line_vals: List of values to use for the arguments in :code:`line_arg`.
:type line_vals: List[Any]
:param line_names: Label names for the different lines.
:type line_names: List[str]
:param plot_name: Name of the plot.
:type plot_name: str
:param args: Dictionary of keyword arguments to remain fixed throughout the benchmark.
:type args: Dict[str, Any]
:param xlabel: Label for the x axis of the plot.
:type xlabel: str, optional
:param ylabel: Label for the y axis of the plot.
:type ylabel: str, optional
:param x_log: Whether the x axis should be log scale.
:type x_log: bool, optional
:param y_log: Whether the y axis should be log scale.
:type y_log: bool, optional
:param styles: A list of tuples, where each tuple contains two elements: a color and a linestyle.
:type styles: list[tuple[str, str]]
N)r   r   r   r   r   r   r   stylesr   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   s                r   __init__Benchmark.__init__   sA    ^ 
 "$
"	r.   )r   r   r   r   r   r   r   r   r   r   r   r   ) r   FFN)__name__
__module____qualname____firstlineno____doc__r   strr   r   boolr   __static_attributes__ r.   r   r   r      s     ;c; S	; 	;
 9; I; ; 38n; ; ; ; ; ;r.   r   c            	       F    \ rS rSrS r  SS\S\S\S\4S jjrSS jr	S	r
g
)Marki3  c                     Xl         X l        g NrR   
benchmarks)r   rR   r   s      r   r   Mark.__init__5  s    $r.   bench	save_path
show_plots
print_datac           	      	   SS K nSS KJn	  SS Kn
UR                  nUR                   Vs/ s H  o S3PM	     nnUR                   Vs/ s H  o S3PM	     nn[        UR                  5      nU
R                  X-   U-   U-   S9nUR                   GH   n[        U[
        [        45      (       d  U Vs/ s H  nUPM     nn[        U5      [        U5      :w  a  [        S[        U5       SU 35      e[        [        X5      5      n/ / / nnnUR                   HI  nU R                   " S0 UDUR"                  U0DUR$                  DUD6n Uu  pnUU/-  nUU/-  nUU/-  nMK     [        U5      U-   U-   U-   UR(                  [        U5      '   GM     UR*                  (       Ga-  U	R-                  5         U	R/                  5       nUS   n[1        UR                  5       GH  u  nnUUS-      UUS-      pUR2                  (       a  UR2                  U   S   OS nUR2                  (       a  UR2                  U   S   OS nUR5                  UU   UU   UUUS9  UR7                  5       R9                  5       (       a  M  UR7                  5       R9                  5       (       a  M  UR;                  [<        5      nUR;                  [<        5      nUR?                  UU   XS	US
9  GM     URA                  5         URC                  URD                  =(       d    U5        URG                  URH                  5        URK                  URL                  (       a  SOS5        URO                  URP                  (       a  SOS5        U(       a  U	RS                  5         U(       a7  U	RU                  URV                  RY                  X!R*                   S35      5        UXR                  -      nU(       a>  URZ                  S   S:X  a+  UR\                  R_                  5       u  nnUU   UU   -
  US'   U(       a1  [a        UR*                  S-   5        [a        URc                  5       5        U(       a;  URe                  URV                  RY                  X!R*                   S35      SU S3SS9  U$ s  snf s  snf s  snf ! [&         a	    US S pn GNwf = f)Nr   z-minz-max)columnsz	Expected z values, got r   )labelcolorlsg333333?)alphar   loglinearz.png   Diff:z.csvz%.fF)float_formatindexr   )3osmatplotlib.pyplotpyplotpandasr   listr   	DataFramer   rs   tupler/   r!   dictrc   r   rR   r   r   	TypeErrorlocr   figuresubplot	enumerater   plotisnullr3   astypery   fill_betweenlegend
set_xlabelr   
set_ylabelr   
set_xscaler   
set_yscaler   showsavefigpathr   shaper   tolistprint	to_stringto_csv)r   r   r   r   r   diff_colsave_precisionkwragsr   pltpdy_meanr   y_miny_maxr   dfrW   x_argsrow_meanrow_minrow_maxr   r   axfirst_xrh   colstycol0col1s                                  r   _run	Mark._run9  s
   '!!%*%5%56%53d%56%*%5%56%53d%56u}}%\\'"2U":U"B\CAa$// '(1Q(1vW% 9S\N-s!KLL#g/*F)+RwgH__ggVV5>>1*=VVvV;+.(F5 VH$E7"E7" % #1g07:WDBFF3r7O' * ???JJLBajG!%"2"231!!f*~r!f*~u,1LLell1oa(d,1LLell1oa(d7RU!33G||~))++ELLN4F4F4H4H!LL/E!LL/EOOBwKTQTOU 4 IIKMM%,,1'2MM%,,'MM5;;%H=MM5;;%H=
BGGLL6Gt4LMN***+q(**,JD$DBtH,BvJ%//C'(",,.!IIbggll90A.FGXZ[iZjjkVl!  #	y 76 ) ! ;+.d5F5;s#   R%R*2R/
R44SSc           	      L   [        U R                  [        5      nU(       a  U R                  /OU R                  n/ nU(       aP  [        R                  " USS9  [        [        R                  R                  US5      S5      n	U	R                  S5        U HN  n
UR                  U R                  " XX40 UD65        U(       d  M/  W	R                  SU
R                   S35        MP     U(       a!  W	R                  S5        U	R                  5         U(       a  U(       a  US	   $ U$ g )
NT)exist_okzresults.htmlwz<html><body>
z<image src="z.png"/>
z</body></html>
r   )rs   r   r   r   makedirsopenr   r   writeappendr   r   close)r   r   r   r   	return_dfkwargshas_single_benchr   
result_dfshtmlr   s              r   runMark.run~  s    %dooyA*:doo&

KK	D1Y?EDJJ'(Edii*[TZ[\y

]5??*;:FG   JJ)*JJL!!}$!!r.   )r   rR   N)F   )FFr   F)r   r   r   r   r   r   r   r   r   r   r   r   r.   r   r   r   3  s>    % chC) C C CSW CJr.   r   c                    ^  U 4S jnU$ )z
Mark a function for benchmarking. The benchmark can then be executed by using the :code:`.run` method on the return value.

:param benchmarks: Benchmarking configurations.
:type benchmarks: List of :class:`Benchmark`
c                    > [        U T5      $ r   )r   r   s    r   <lambda>perf_report.<locals>.<lambda>  s    b*-r.   r   )r   wrappers   ` r   perf_reportr    s     .GNr.   c                    SSK nSSKJn  U (       d  UR                  R	                  5       n UR
                  R                  R                  U 5      S   nUR
                  R                  R                  U 5      S   nX4-  S-  S-  S	-  nU$ )
zreturn DRAM bandwidth in GB/s r   Nr   r^   mem_clock_ratemem_bus_widthr   g    .A   )rC   r	   r^   rD   current_devicer_   utilsget_device_properties)devicerC   r^   mem_clock_khz	bus_widthbw_gbpss         r   get_dram_gbpsr    sx    **,MM''==fEFVWM##99&A/RI'!+c1A5GNr.   c                 Z   SS K nSSKJn  U(       d  UR                  R	                  5       nUR
                  R                  R                  U5      S   S-  nUR                  R                  U5      nUS   S:  a  XR                  :X  d   eSnOXR                  UR                  4;   a  SnOtXR                  UR                  UR                  4;   a  SnOKXR                  [        R                   [        R"                  [        R$                  4;   a  S	nO['        S
5      eXQ-  U-  S-  nU$ )Nr   r   r  multiprocessor_count   r     i   i   dtype not supported&.>)rC   r	   r^   rD   r	  r_   r
  r  get_device_capabilityfloat16float32int32rx   int16int8tl
float8e4nvfloat8e4b15float8e5RuntimeError	rw   
clock_rater  rC   r^   num_subcores
capabilityops_per_sub_coretflopss	            r   get_max_tensorcore_tflopsr(    s    **,==&&<<VDE[\_``L11&9J!}q%%%]]EKK00"}}ennekkBB"zz2==".."++NN#455&)99D@FMr.   c                     ^  U 4S jnU$ )Nc                 J   >^  [         R                  " T 5      UU 4S j5       nU$ )Nc                    > SS K nUR                  [        R                  " 5       5      R	                  5       nT
R                  5       UR                  5       :*  nU(       a  US:w  a  [        R                  R                  TR                  S   5      n[        R                  S   SS.nSU;   d   S5       eUS   R                  R                  R                  nU S	TR                   S
U S3n[        R                  " SSSU/SUS9n	U	R                   S:X  d   S5       eS[#        U	R$                  5      ;   d   eg T" U 0 UD6  g )Nr   zcuda-memcheck__file__PATH1)r-  PYTORCH_NO_CUDA_MEMORY_CACHINGrequestz@memcheck'ed test must have a (possibly unused) `request` fixturez::[]pytestz-vsT)capture_outputenvz7cuda-memcheck returned an error: bounds checking failedzERROR SUMMARY: 0 errors)psutilProcessr   getppidnameitemsr   realpath__globals__environnodecallspecidr   r   r   
returncoder   r   )r   r   r6  	ppid_namerun_cuda_memcheckr   r5  test_idr   r   target_kwargstest_fns             r   r  1cuda_memcheck.<locals>.decorator.<locals>.wrapper  s!   rzz|499;I - 3 3 5 G Y/%Aww''(;(;J(GH!zz&1UXY F*n,nn* +0099<<b!1!1 2!G9A> nnox%L]agjk~~*e,ee*0C

OCCC((r.   )	functoolswraps)rF  r  rE  s   ` r   	decorator cuda_memcheck.<locals>.decorator  s%    		!	) 
"	)" r.   r   )rE  rJ  s   ` r   cuda_memcheckrL    s    , r.   c           	   #     #     [         R                  " / SQ5        [         R                  " SSSSU  SU  3/5        [         R                  " SSSSU SU 3/5        [        S/5      S	   n[        S
/5      S	   n[        X -
  5      S:  d   SU  S35       e[        X1-
  5      S:  d   SU S35       eSU -  nSU-  S-  nXE4v   [         R                  " / SQ5        [         R                  " / SQ5        [         R                  " / SQ5        g ! [         R                  " / SQ5        [         R                  " / SQ5        [         R                  " / SQ5        f = f7f)N)r   r   r   -pmr.  r   r   r   z--lock-gpu-clocks=r   z--lock-memory-clocks=zclocks.current.smr   zclocks.current.memoryrB   zGPU SMs must run at z MHzg 3O?i   gMbP?)r   r   r   rN  r   )r   r   r   z-rgc)r   r   r   z-rmc)r   r   r   abs)ref_sm_clockref_mem_clockcur_sm_clockcur_mem_clockr'  gbpss         r   set_gpu_clockrU    sf    C EF a~>	!
 	 	#M?!M?C	!
 	 123A6678;<./"4_8L\NZ^6__4=01B6b:N}o]a8bb6)L8&-l EF AB AB 	 EF AB ABs   EC D A	EA
EEc                    SS K nSSKJn  U(       d  UR                  R	                  5       nUR
                  R                  R                  U5      S   S-  nUR                  R                  5       nUS   S:  a/  XR                  :X  a  SnOXXR                  :X  a  SnOF[        S	5      eXR                  :X  a  SnO)XR                  UR                  4;   a  SnO[        S	5      eXQ-  U-  S
-  nU$ )Nr   r   r  r  r  r      @   r  r  )rC   r	   r^   rD   r	  r_   r
  r  r  r  r  r!  rx   r"  s	            r   get_max_simd_tflopsrY    s    **,==&&<<VDE[\_``L113J!}qMM!!mm#!455MM!!}}enn55!455&)99D@FMr.   )   NNr6   )   d   NNr6   )NNr   r   )iF  i  )rH  r"   r   r8   r   r   
contextlibr   typingr   r   r   r   r   r  r	   r   r1   r<   r\   rk   r   r   r   r  r  r(  rL  rU  rY  r   r.   r   <module>r_     s      	   
 % " "  ( ($<B~?@D0^f@ @F` `F
:6 C C8r.   