
    &ThM                     :   S SK r S SKrS SKrS SKrS SKrS SKrS SKJr  S SKJ	r	  S SK
Jr  S SKJr  S SKJr  S SKJr  \R$                  R'                  \R$                  R)                  \5      5      r\R$                  R-                  \S5      /r\R$                  R-                  \S	5      rS
/r\ R4                  " 5       S 5       r\ R4                  " 5       S 5       r\ R4                  " 5       S 5       rS r " S S\5      r S r!S r" " S S\5      r# " S S\5      r$g)    N)Path)_build)get_cache_manager)_allocation)	GPUTarget)	GPUDriverincludelibcudac            	      "   [         R                  " S5      n U (       a  U /$ [        R                  " SS/5      R	                  5       nUR                  5        Vs/ s H  nSU;   d  M  UR                  5       S   PM      nnU Vs/ s H"  n[         R                  R                  U5      PM$     nn[         R                  " S5      nU(       al  U(       de  UR                  S5       Vs/ s HI  n[         R                  R                  [         R                  R                  US5      5      (       d  MG  UPMK     nnSnU(       a  US	[        U5      -  -  nUS
-  nO
US-  nUS-  n[        S U 5       5      (       d   U5       eU$ s  snf s  snf s  snf )NTRITON_LIBCUDA_PATHz/sbin/ldconfigz-plibcuda.so.1LD_LIBRARY_PATH:zlibcuda.so cannot found!
z!Possible files are located at %s.z:Please create a symlink of libcuda.so to any of the files.z<Please make sure GPU is set up and then run "/sbin/ldconfig"z- (requires sudo) to refresh the linker cache.c              3      #    U  HB  n[         R                  R                  [         R                  R                  US 5      5      v   MD     g7f)r   N)ospathexistsjoin).0r   s     U/var/www/auris/envauris/lib/python3.13/site-packages/triton/backends/nvidia/driver.py	<genexpr>libcuda_dirs.<locals>.<genexpr>)   s/     Sddrww~~bggll4@AAds   A
A)r   getenv
subprocesscheck_outputdecode
splitlinessplitr   dirnamer   r   strany)	env_libcuda_pathlibslinelocslocdirsenv_ld_library_pathdirmsgs	            r   libcuda_dirsr-      s\   yy!67 !!""$4d#;<CCED *.):U):nPT>TDJJL):DU,01DSBGGOOC DD1))$564288=s=PRPWPWP\P\]`bpPqAr=s
&C2SY>>KKMM>>SdSSSXUXXSK V1 ts   
F'F)F&AF0Fc                  $    [         /[        5       Q$ N)libdevice_dirr-        r   library_dirsr3   -   s    +LN++r2   c                  ^    SSK Jn JnJn  SR	                  U " 5       U" 5       /U" 5       Q5      $ )Nr   machinesystemarchitecture,)platformr6   r7   r8   r   r5   s      r   platform_keyr;   2   s'    6688WY:<>:;;r2   c           	         [         R                  " U [        5       -   R                  S5      5      R	                  5       n[        U5      n[        R                  " S5      R                  S5      S   nUR                  U SU 35      nUc  [        R                  " 5        n[        R                  R                  US5      n[        US5       nUR!                  U 5        S S S 5        [#        XU[%        5       [&        [(        5      n	[        U	S5       nUR+                  UR-                  5       U SU 3SS	9nS S S 5        S S S 5        S
S Kn
U
R0                  R3                  X5      nU
R0                  R5                  U5      nUR6                  R9                  U5        U$ ! , (       d  f       N= f! , (       d  f       N~= f! , (       d  f       N= f)Nzutf-8
EXT_SUFFIX.r   zmain.cwrbT)binaryr   )hashlibsha256r;   encode	hexdigestr   	sysconfigget_config_varr    get_filetempfileTemporaryDirectoryr   r   r   openwriter   r3   include_dir	librariesputreadimportlib.utilutilspec_from_file_locationmodule_from_specloaderexec_module)srcnamekeycacheext
cache_pathtmpdirsrc_pathfso	importlibspecmods                r   compile_module_from_srcrd   8   s\   
..#.66w?
@
J
J
LCc"E

"
"<
0
6
6s
;B
?C4&#0J((*fww||FH5Hh$ %YWBb$1"YYqvvxD63%YN
   + >>11$CD
..
)
)$
/CKKC J %$   +*s<    -G F3G %F/7G 
F,	(G /
F=	9G  
Gc                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	CudaUtilsQ   c                 n   > [        U S5      (       d  [        [        U ]  U 5      U l        U R                  $ )Ninstance)hasattrsuperrf   __new__ri   )cls	__class__s    r   rl   CudaUtils.__new__S   s-    sJ'' C8=CL||r2   c                 Z   [        [        [        R                  R	                  [
        S5      5      R                  5       S5      nUR                  U l        UR                  U l        UR                  U l	        UR                  U l
        UR                  U l        UR                  U l        g )Nzdriver.c
cuda_utils)rd   r   r   r   r   r!   	read_textload_binaryget_device_propertiescuOccupancyMaxActiveClustersset_printf_fifo_sizefill_1d_tma_descriptorfill_2d_tma_descriptor)selfrc   s     r   __init__CudaUtils.__init__X   s~    %d277<<+L&M&W&W&Y[gh??%(%>%>",/,L,L)$'$<$<!&)&@&@#&)&@&@#r2   )ru   rw   rx   rt   rs   rv   )__name__
__module____qualname____firstlineno__rl   rz   __static_attributes____classcell__rn   s   @r   rf   rf   Q   s    
A Ar2   rf   c                     U S   S:X  a  g0 SS_SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_U    $ )Nr   *CUdeviceptri1int32_ti8int8_ti16int16_ti32i64int64_tu1uint32_tu8uint8_tu16uint16_tu32u64uint64_tfp16floatbf16fp32f32fp64double	nvTmaDescCUtensorMapr1   )tys    r   	ty_to_cppr   g   s    	!u|ih 	y 	y	
 	y 	j 	i 	z 	z 	z 	 	 	 	w 	  	]!" 	#
 
r2   c                 |  ^^^ U4S jmU4S jmU4S jmSR                  UR                  5        Vs/ s H  nT" U5      PM     sn5      nSU-   nSR                  [        TUR                  5       5      5      n[        [	        [
        UR                  S5      5      5      n[        U5       VVs0 s H  u  pVXV_M	     nnn[        U5      S:  a)  SSR                  S	 UR                  5        5       5      -   OSnSR                  S
 UR                  5        5       5      n/ n	UR                  5        H]  u  pRUS   S:X  a  U	R                  SU S35        M%  US:X  a  U	R                  SU 35        MA  US:w  d  MI  U	R                  SU 35        M_     [        [        U5      5      n
SnUR                  5        VVs/ s H  u  pRUS   S:X  d  M  SU SU SU SU S3	PM!     nnnUR                  5        VVs/ s H  u  pRUS:X  d  M  SU SU SU S3PM     nnnUR                  5        VVs/ s H  u  pRUS:w  d  M  SU 3PM     n
nnU
R                  S5        S[        U5      S:  a  SU-   OS SSR                  U
5       SUR                  UR                  5        VVs/ s H  u  pRT" U5       S U S!3PM     snn5       S"U S#U S$UR                  U5       SUR                  U5       S%[        U	5      S:  a  SSR                  U	5      -   OS S&3nU$ s  snf s  snnf s  snnf s  snnf s  snnf s  snnf )'Nc                 h   > [        U [        5      (       a  SR                  [        TU 5      5      $ U $ )Nr9   )
isinstancetupler   map)sig_serialize_signatures    r   r   +make_launcher.<locals>._serialize_signature   s,    c5!!88C 4c:;;
r2   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  g[	        U 5      $ )Nr9   []r   r   z	PyObject*	constexprr   r   r   r   r   r   )r   val_extracted_types     r   r   &make_launcher.<locals>._extracted_type   sW    b%  ((334Cse1:a5C<++}r2   c                    > [        U [        5      (       a!  SR                  [        TU 5      5      nSU S3$ U S   S:X  a  gU S;   a  gSS	S
SSSSSSSSS.[	        U 5         $ )N ()r   r   Or   r_   dlbhiLBHIK)r   r   longr   r   r   r   r   r   r   r   r   )r   r   	format_ofs     r   r    make_launcher.<locals>.format_of   s    b%  ''#i,-Cse1:a5C<++
 B- 	r2   r   iiiKKpOOOOOr9   r   z, c              3   0   #    U  H  u  pS U 3v   M     g7f)z&_argNr1   r   r   r   s      r   r    make_launcher.<locals>.<genexpr>   s      L:K5:Ks   c              3   V   #    U  H  u  pUS :w  d  M  [        U5       SU 3v   M!     g7f)r   z argN)r   r   s      r   r   r      s/     hARVX\gVg3Yr]O4s3ARs   ))r   ptr_infoz.dev_ptrr   z*tma_ptrr   _argz
  zDevicePtrInfo ptr_infoz = getPointer(_argz); if (!ptr_infoz.valid) return NULL;zCUtensorMap* tma_ptrz = getTmaDesc(_argz); if (!tma_ptrz) return NULL;z&argz&global_scratchaB  
#include "cuda.h"
#include <stdbool.h>
#include <Python.h>
#include <dlfcn.h>

static inline void gpuAssert(CUresult code, const char *file, int line)
{
   if (code != CUDA_SUCCESS)
   {
      const char* prefix = "Triton Error [CUDA]: ";
      const char* str;
      cuGetErrorString(code, &str);
      char err[1024] = {0};
      strcat(err, prefix);
      strcat(err, str);
      PyGILState_STATE gil_state;
      gil_state = PyGILState_Ensure();
      PyErr_SetString(PyExc_RuntimeError, err);
      PyGILState_Release(gil_state);
   }
}

#define CUDA_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }

typedef CUresult (*cuLaunchKernelEx_t)(const CUlaunchConfig* config, CUfunction f, void** kernelParams, void** extra);

static cuLaunchKernelEx_t getLaunchKernelExHandle() {
  // Open the shared library
  void* handle = dlopen("libcuda.so.1", RTLD_LAZY);
  if (!handle) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to open libcuda.so.1");
    return NULL;
  }
  // Clear any existing error
  dlerror();
  cuLaunchKernelEx_t cuLaunchKernelExHandle = (cuLaunchKernelEx_t)dlsym(handle, "cuLaunchKernelEx");
  // Check for errors
  const char *dlsym_error = dlerror();
  if (dlsym_error) {
    PyErr_SetString(PyExc_RuntimeError, "Failed to retrieve cuLaunchKernelEx from libcuda.so.1");
    return NULL;
  }
  return cuLaunchKernelExHandle;
}

static void _launch(int gridX, int gridY, int gridZ, int num_warps, int num_ctas, int launch_cooperative_grid, int clusterDimX, int clusterDimY, int clusterDimZ, int shared_memory, CUstream stream, CUfunction function, CUdeviceptr global_scratchz) {
  void *params[] = { aO   };
  if (gridX*gridY*gridZ > 0) {
    if ((num_ctas == 1) && (0 == launch_cooperative_grid)) {
      CUDA_CHECK(cuLaunchKernel(function, gridX, gridY, gridZ, 32*num_warps, 1, 1, shared_memory, stream, params, 0));
    } else if ((num_ctas == 1) && (0 != launch_cooperative_grid)) {
      CUlaunchAttribute launchAttr[1];
      CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
      launchAttr[0] = coopAttr;

      CUlaunchConfig config;
      config.gridDimX = gridX;
      config.gridDimY = gridY;
      config.gridDimZ = gridZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = 1;

      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));

    } else {
      CUlaunchAttribute launchAttr[3];
      launchAttr[0].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
      launchAttr[0].value.clusterDim.x = clusterDimX;
      launchAttr[0].value.clusterDim.y = clusterDimY;
      launchAttr[0].value.clusterDim.z = clusterDimZ;
      launchAttr[1].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE;
      launchAttr[1].value.clusterSchedulingPolicyPreference = CU_CLUSTER_SCHEDULING_POLICY_SPREAD;

      unsigned numAttrs = 2;
      if (0 != launch_cooperative_grid) {
        CUlaunchAttribute coopAttr = { .id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, .value = 1};
        launchAttr[2] = coopAttr;
        numAttrs = 3;
      }

      CUlaunchConfig config;
      config.gridDimX = gridX * clusterDimX;
      config.gridDimY = gridY * clusterDimY;
      config.gridDimZ = gridZ * clusterDimZ;
      config.blockDimX = 32 * num_warps;
      config.blockDimY = 1;
      config.blockDimZ = 1;
      config.sharedMemBytes = shared_memory;
      config.hStream = stream;
      config.attrs = launchAttr;
      config.numAttrs = numAttrs;
      static cuLaunchKernelEx_t cuLaunchKernelExHandle = NULL;
      if (cuLaunchKernelExHandle == NULL) {
        cuLaunchKernelExHandle = getLaunchKernelExHandle();
      }
      CUDA_CHECK(cuLaunchKernelExHandle(&config, function, params, 0));
    }
  }
}

typedef struct _DevicePtrInfo {
    CUdeviceptr dev_ptr;
    bool valid;
} DevicePtrInfo;

static inline DevicePtrInfo getPointer(PyObject *obj, int idx) {
  DevicePtrInfo ptr_info;
  ptr_info.dev_ptr = 0;
  ptr_info.valid = true;
  if (PyLong_Check(obj)) {
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(obj);
    return ptr_info;
  }
  if (obj == Py_None) {
    // valid nullptr
    return ptr_info;
  }
  PyObject *ptr = PyObject_GetAttrString(obj, "data_ptr");
  if(ptr){
    PyObject *empty_tuple = PyTuple_New(0);
    PyObject *ret = PyObject_Call(ptr, empty_tuple, NULL);
    Py_DECREF(empty_tuple);
    Py_DECREF(ptr);
    if (!PyLong_Check(ret)) {
      PyErr_SetString(PyExc_TypeError, "data_ptr method of Pointer object must return 64-bit int");
      ptr_info.valid = false;
      return ptr_info;
    }
    ptr_info.dev_ptr = PyLong_AsUnsignedLongLong(ret);
    if(!ptr_info.dev_ptr)
      return ptr_info;
    uint64_t dev_ptr;
    int status = cuPointerGetAttribute(&dev_ptr, CU_POINTER_ATTRIBUTE_DEVICE_POINTER, ptr_info.dev_ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
        PyErr_Format(PyExc_ValueError,
                     "Pointer argument (at %d) cannot be accessed from Triton (cpu tensor?)", idx);
        ptr_info.valid = false;
    } else if (status != CUDA_SUCCESS) {
        CUDA_CHECK(status);  // Catch any other cuda API errors
        ptr_info.valid = false;
    }
    ptr_info.dev_ptr = dev_ptr;
    Py_DECREF(ret);  // Thanks ChatGPT!
    return ptr_info;
  }
  PyErr_SetString(PyExc_TypeError, "Pointer argument must be either uint64 or have data_ptr method");
  ptr_info.valid = false;
  return ptr_info;
}

static inline CUtensorMap* getTmaDesc(PyObject *obj) {
  if (sizeof(CUtensorMap*) != 8) {
    PyErr_SetString(PyExc_SystemError, "getTmaDesc() requires 64-bit compilation");
    return NULL;
  }

  PyObject *method_handle = PyObject_GetAttrString(obj, "tma_desc_cpu_ptr");
  if (!method_handle) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() method does not exist");
    return NULL;
  }

  PyObject *empty_tuple = PyTuple_New(0);
  if (!empty_tuple) {
    Py_DECREF(method_handle);
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }
  PyObject *method_ret = PyObject_Call(method_handle, empty_tuple, NULL);
  Py_DECREF(empty_tuple);
  Py_DECREF(method_handle);
  if (!method_ret) {
    PyErr_SetString(PyExc_SystemError, "Internal Python error!");
    return NULL;
  }

  if (!PyLong_Check(method_ret)) {
    PyErr_SetString(PyExc_TypeError, "tma_desc_cpu_ptr() must return 64-bit int");
    Py_DECREF(method_ret);
    return NULL;
  }

  uint64_t ptr_as_uint = PyLong_AsUnsignedLongLong(method_ret);
  Py_DECREF(method_ret);
  if (!ptr_as_uint) {
    PyErr_SetString(PyExc_ValueError, "received NULL ptr from tma_desc_cpu_ptr()");
    return NULL;
  }
  if (ptr_as_uint % 64 != 0) {
    PyErr_SetString(PyExc_ValueError, "tma_desc_cpu_ptr() must be 64-byte aligned");
    return NULL;
  }

  return (CUtensorMap*)(ptr_as_uint);
}

static void ensureCudaContext() {
  CUcontext pctx;
  CUDA_CHECK(cuCtxGetCurrent(&pctx));
  if (!pctx) {
    // Ensure device context.
    CUdevice device;
    CUDA_CHECK(cuDeviceGet(&device, 0));
    CUDA_CHECK(cuDevicePrimaryCtxRetain(&pctx, device));
    CUDA_CHECK(cuCtxSetCurrent(pctx));
  }
}

static PyObject* launch(PyObject* self, PyObject* args) {
  // ensure cuda context is valid before calling any CUDA APIs, e.g. before getPointer calls cuPointerGetAttributes
  ensureCudaContext();

  int gridX, gridY, gridZ;
  uint64_t _stream;
  uint64_t _function;
  int launch_cooperative_grid;
  PyObject *launch_enter_hook = NULL;
  PyObject *launch_exit_hook = NULL;
  PyObject *kernel_metadata = NULL;
  PyObject *launch_metadata = NULL;
  PyObject *global_scratch_obj = NULL;
  z _arg;z
  if(!PyArg_ParseTuple(args, "a*  ", &gridX, &gridY, &gridZ,
                                           &_stream, &_function, &launch_cooperative_grid, &global_scratch_obj,
                                           &kernel_metadata, &launch_metadata,
                                           &launch_enter_hook, &launch_exit_hookat  )) {
    return NULL;
  }

  int num_warps, num_ctas, shared_memory, clusterDimX, clusterDimY, clusterDimZ;
  if (!PyArg_ParseTuple(kernel_metadata, "iiiiii", &num_warps, &num_ctas, &shared_memory, &clusterDimX, &clusterDimY, &clusterDimZ)) {
    PyErr_SetString(PyExc_TypeError, "kernel_metadata must be a tuple");
    return NULL;
  }

  // extract launch metadata
  if (launch_enter_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_enter_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;
  }

  CUdeviceptr global_scratch = 0;
  if (global_scratch_obj != Py_None) {
    DevicePtrInfo global_scratch_info = getPointer(global_scratch_obj, -1);
    if (!global_scratch_info.valid) {
      return NULL;
    }
    global_scratch = global_scratch_info.dev_ptr;
  }

  // raise exception asap
  z
  Py_BEGIN_ALLOW_THREADS;
  _launch(gridX, gridY, gridZ, num_warps, num_ctas, launch_cooperative_grid, clusterDimX, clusterDimY, clusterDimZ, shared_memory, (CUstream)_stream, (CUfunction)_function, global_scratcha0  );
  Py_END_ALLOW_THREADS;
  if (PyErr_Occurred()) {
    return NULL;
  }

  if(launch_exit_hook != Py_None){
    PyObject* args = Py_BuildValue("(O)", launch_metadata);
    PyObject* ret = PyObject_CallObject(launch_exit_hook, args);
    Py_DECREF(args);
    if (!ret)
      return NULL;

  }

  Py_RETURN_NONE;
}

static PyMethodDef ModuleMethods[] = {
  {"launch", launch, METH_VARARGS, "Entry point for all kernels with this signature"},
  {NULL, NULL, 0, NULL} // sentinel
};

static struct PyModuleDef ModuleDef = {
  PyModuleDef_HEAD_INIT,
  "__triton_launcher",
  NULL, //documentation
  -1, //size
  ModuleMethods
};

PyMODINIT_FUNC PyInit___triton_launcher(void) {
  PyObject *m = PyModule_Create(&ModuleDef);
  if(m == NULL) {
    return NULL;
  }
  PyModule_AddFunctions(m, ModuleMethods);
  return m;
}
)r   valuesr   listfilterboolr    	enumeratelenitemsappendrange)	constants	signaturer   args_formatformatr   s	args_list	arg_declsinternal_args_listparamsnewline	ptr_decls	tma_declsrW   r   r   r   s                  @@@r   make_launcherr   ~   s   
, ''93C3C3EF3ER9R=3EFGK[(F193C3C3EFGIVD)//#"678I"+I"67"6$!"6I7PST]P^abPbtyy L)//:K LLLhjI 		hARhhI"a5C<%%8&<=;%%n5;%%QCj1 # 3y>"F G __&&EAa5C< 	f
 #5aS1#=MaSPde&   foetetevev\a\] 	X
qc!3A3oaSWev   '0oo&7M&7UQ2;LjQCj&7FM
MM#$.v\ KN  OX  KY  \]  K]  w{  ~G  wG  ce  vf fyy() x*p <<	@QR@QuqOB'(aS2@QRST U  &x (Q R[P[ \: <<	 
<<	 | eh  i{  e|  @  e@  }A  DH  DM  DM  N`  Da  }a  FH  |I 'IWrCf	 Ji
 G 8$
 NR Ss5   L0L =L&L&9L,	L,/L2?	L2$L8c                        \ rS rSrS rS rSrg)CudaLauncheri  c                   ^ [        TS5      (       a  TR                  O	[        5       nU4S jnUR                  5        VVs0 s H  u  pVU" U5      U_M     nnnTR                  R                  5        VVs0 s H  u  pVXV_M	     nnn[        X75      m[        TS5      nUR                  U l        UR                  U l        UR                  U l	        UR                  U l
        g s  snnf s  snnf )Nr   c                 ~   > [        U [        5      (       a&  TR                  R                  R	                  U 5      4$ U $ r/   )r   r"   fn	arg_namesindex)xrW   s    r   <lambda>'CudaLauncher.__init__.<locals>.<lambda>   s2    Z3=O=OSVV--33A69VUVVr2   __triton_launcher)rj   r   dictr   r   r   rd   launchglobal_scratch_sizeglobal_scratch_alignlaunch_cooperative_grid)	ry   rW   metadatar   arg_idxidxvaluer   rc   s	    `       r   rz   CudaLauncher.__init__  s    %,S+%>%>CMMDF	V;D??;LM;LZSWS\5(;L	M25--2E2E2GH2GJCSZ2G	HI1%c+>?jj#+#?#? $,$A$A!'/'G'G$ NHs   C$7C*c           	          U R                   S:  a7  X-  U-  nXpR                   -  n[        R                  " XR                  U5      n	OS n	U R                  " XX4XPR
                  U	/UQ76   g Nr   )r   r   
_allocatorr   r   r   )
ry   gridXgridYgridZstreamfunctionargs	grid_size
alloc_sizeglobal_scratchs
             r   __call__CudaLauncher.__call__
  sg    ##a'-I"%=%==J(33J@Y@Y[abN!NE%;W;WYgojnor2   )r   r   r   r   N)r|   r}   r~   r   rz   r  r   r1   r2   r   r   r     s    
Hpr2   r   c                   \   ^  \ rS rSrU 4S jrS rS rS r\S 5       r	S r
S rS	 rS
rU =r$ )
CudaDriveri  c                 V   > [        5       U l        [        U l        [        TU ]  5         g r/   )rf   utilsr   launcher_clsrk   rz   )ry   rn   s    r   rz   CudaDriver.__init__  s    [
(r2   c                 |    U R                  5       nU R                  U5      nUS   S-  US   -   nSn[        SX#5      $ )Nr   
          r   )get_current_deviceget_device_capabilityr   )ry   device
capability	warp_sizes       r   get_current_targetCudaDriver.get_current_target  sI    ((*//7
]R'*Q-7
	77r2   c                 J    SS K nUR                  SU R                  5       5      $ )Nr   r   )torchr  r  ry   r  s     r   get_active_torch_device"CudaDriver.get_active_torch_device"  s    ||FD$;$;$=>>r2   c                 "    SS K nUR                  $ r   )r  r   r  s     r   get_device_interfaceCudaDriver.get_device_interface&  s    zzr2   c                       SS K n U R                  R                  5       =(       a    U R                  R                  S L $ ! [
         a     gf = f)Nr   F)r  r   is_availableversionhipImportError)r  s    r   	is_activeCudaDriver.is_active*  sC    	::**,L%--2C2Ct2KL 		s   <? 
AAc                     SSK Jn  U$ )Nr   )do_bench)triton.testingr(  )ry   r(  s     r   get_benchmarkerCudaDriver.get_benchmarker2  s
    +r2   c                 \    SS K nSnUR                  [        US-  5      UR                  SS9$ )Nr   i      r   )dtyper  )r  emptyint)ry   r  
cache_sizes      r   get_empty_cache_for_benchmark(CudaDriver.get_empty_cache_for_benchmark6  s.    
 '
{{3zQ/uyy{PPr2   c                 $    UR                  5         g r/   )zero_)ry   rZ   s     r   clear_cacheCudaDriver.clear_cache?  s    r2   )r  r
  )r|   r}   r~   r   rz   r  r  r  staticmethodr%  r*  r2  r6  r   r   r   s   @r   r  r    s@    
8?  Q r2   r  )%	functoolsr   rF   rB   r   rI   pathlibr   triton.runtime.buildr   triton.runtime.cacher   triton.runtimer   triton.backends.compilerr   triton.backends.driverr   r   r!   realpath__file__r   rM   r0   rN   	lru_cacher-   r3   r;   rd   objectrf   r   r   r   r  r1   r2   r   <module>rD     s    	      ' 2 & . ,
''//"''**84
5ww||GY/0We,H	  0 , , < <
2A A,
.{|p6 p0, ,r2   