a
    h0                  	   @   s   d dl Z d dlZd dlmZmZmZ d dlZd dlmZ	 e j
dddZedddd	Ze j
dd
dZdeeee eee ee edddZG dd dZG dd dZdeeef eee  eeeedf f dddZdeeeedddZdS )    N)AnyOptionalUnion)_get_device_index)returnc                   C   s"   t jdkrtdS tdS d S )Nwin32z
nvcuda.dllzlibcuda.so.1sysplatformctypesCDLL r   r   ?/var/www/auris/lib/python3.9/site-packages/torch/cuda/_utils.py_get_cuda_library   s    

r   resultr   c                 C   sV   | dkrd S t  }t }|| t | |jd ur@|j nd}td| d S )Nr   Unknown CUDA errorCUDA error: )r   c_char_pr   ZcuGetErrorStringbyrefvaluedecodeRuntimeError)r   err_strlibcudaerror_messager   r   r   _check_cuda   s    r   c                   C   s"   t jdkrtdS tdS d S )Nr   znvrtc64_120_0.dllzlibnvrtc.sor   r   r   r   r   _get_nvrtc_library    s    

r    )kernel_sourcekernel_namecompute_capabilityheader_codecuda_include_dirsnvcc_optionsr   c              	      s  ddl }t d tdd fdd}|  ds@d|  } |rR|d |  }n| }|d	}	|du r|j|j }
|
j	 |
j
 }g }|d
|   |r|D ]}|d|   q|r|D ]}||d	 qddlm} dd |D }|dd |D  t|}tj| | }t }|t||	| d ddd |||}| krt }|t| t|j}|| td|j  t }||t| t|j}||| t| |jS )a  
    Compiles a CUDA kernel using NVRTC and returns the PTX code.

    Args:
        kernel_source (str): The CUDA kernel source code as a string
        kernel_name (str): The name of the kernel function to compile
        compute_capability (str, None): The compute capability to target (e.g., "86").
                                           If None, will detect from current device.
        header_code (str, optional): Additional header code to prepend to the kernel source
        cuda_include_dirs (list, None): List of directories containing CUDA headers
        nvcc_options (list, None): Additional options to pass to NVRTC

    Returns:
        str: The compiled PTX code
    r   Nr   c                    sL   |  krHt  }| t | |jd ur6|j nd}td| d S )Nr   r   )r   r   ZnvrtcGetErrorStringr   r   r   r   )r   r   r   ZNVRTC_SUCCESSZlibnvrtcr   r   check_nvrtcJ   s    z#_nvrtc_compile.<locals>.check_nvrtcz
extern "C"zextern "C" 
utf-8z--gpu-architecture=sm_z-I)COMMON_NVCC_FLAGSc                 S   s   g | ]}|d kr|qS )z--expt-relaxed-constexprr   .0flagr   r   r   
<listcomp>y   s   z"_nvrtc_compile.<locals>.<listcomp>c                 S   s   g | ]}| d qS )r(   )encoder*   r   r   r   r-   |       z.cuzKernel compilation failed:
) 
torch.cudar   intstrip
startswithr.   cudaZget_device_propertiesZcurrent_devicemajorminorappendZtorch.utils.cpp_extensionr)   extendlenr   r   c_void_pZnvrtcCreateProgramr   ZnvrtcCompileProgramc_size_tZnvrtcGetProgramLogSizecreate_string_bufferr   ZnvrtcGetProgramLogr   r   ZnvrtcGetPTXSizeZnvrtcGetPTXZnvrtcDestroyProgram)r   r    r!   r"   r#   r$   torchr&   Zfull_sourcesource_bytespropsoptions	directoryoptionr)   Znvrtc_compatible_flagsZnum_optionsZoptions_arrayprogresZlog_sizelogZptx_sizeptxr   r%   r   _nvrtc_compile)   sh    


rG   c                   @   s.   e Zd ZejddddZeddddZdS )	_CudaModuleN)moduler   c                 C   s   || _ i | _d S N)_module_kernels)selfrI   r   r   r   __init__   s    z_CudaModule.__init___CudaKernel)namer   c              
   C   s   || j v r| j | S ddlm} | }t }z<t|t|| j|	d t
|| j}|| j |< |W S  ty } ztd| d|W Y d }~n
d }~0 0 d S )Nr   )r   r(   zNo kernel named 'z' in this module)rL   Ztorch.cuda._utilsr   r   r:   r   cuModuleGetFunctionr   rK   r.   rO   r   AttributeError)rM   rP   r   r   funckernelerrr   r   r   __getattr__   s     


z_CudaModule.__getattr__)__name__
__module____qualname__r   r:   rN   strrV   r   r   r   r   rH      s   rH   c                   @   s\   e Zd ZdZejejddddZdeeeef eeeef e	e
 ee	e ddd	d
ZdS )rO   zT
    Represents a compiled CUDA kernel that can be called with PyTorch tensors.
    N)rS   rI   r   c                 C   s   || _ || _d S rJ   )rS   rI   )rM   rS   rI   r   r   r   rN      s    z_CudaKernel.__init__   r\   r\   r   )gridblockargs
shared_memstreamr   c                 C   sp  ddl }|jj }|sg }g }g }	|D ]}
t|
|jr~|
jsT|
jrL|
 sTt	dt
|
 }|| |	t
| q(t|
trt
|
}|	t
| q(t|
trt
|
}|	t
| q(tdt|
 q(t
jt|	  }t|	D ]\}}
t
|
t
j||< q|du r.ddl}|j }t|| j|d |d |d |d |d |d ||j|d dS )a  
        Call the compiled CUDA kernel

        Args:
            grid (tuple): Grid dimensions (grid_x, grid_y, grid_z)
            block (tuple): Block dimensions (block_x, block_y, block_z)
            args (list): List of arguments to pass to the kernel.
                         PyTorch tensor arguments will be automatically converted to pointers.
            shared_mem (int): Shared memory size in bytes
            stream (torch.cuda.Stream): CUDA stream to use. If None, uses current stream.
        r   Nz?All tensor arguments must be CUDA tensors or pinned CPU tensorszUnsupported argument type: r\      )r=   r4   _utilsr   
isinstanceZTensorZis_cudaZis_cpu	is_pinned
ValueErrorr   r:   Zdata_ptrr7   r   r1   c_intfloatc_float	TypeErrortyper9   	enumeratecastr0   current_streamr   ZcuLaunchKernelrS   Z_as_parameter_)rM   r]   r^   r_   r`   ra   r=   r   Zprocessed_argsZc_argsargptrrg   ri   Zc_args_arrayir   r   r   __call__   sV    






z_CudaKernel.__call__)r[   r[   Nr   N)rW   rX   rY   __doc__r   r:   rN   tupler1   r   listr   rr   r   r   r   r   rO      s        rO   )rF   kernel_namesr   c           	   	   C   s   ddl }t }t| tr"| d} t }|j }|& t	|
t||  W d   n1 sd0    Y  |szt|S i }|D ]8}t }t	|t|||d t||||< q|S )a,  
    Loads a CUDA module from PTX code and returns a module object that can access kernels.

    Args:
        ptx (bytes or str): The PTX code to load
        kernel_names (list, optional): List of kernel names to extract from the module.
                                      If None, will return a module object with __getattr__.

    Returns:
        object: If kernel_names is None, returns a module object with __getattr__ to access kernels.
               If kernel_names is provided, returns a dict mapping kernel names to _CudaKernel objects.
    r   Nr(   )r0   r   rd   rZ   r.   r   r:   r4   rn   r   ZcuModuleLoadDatar   rH   rQ   rO   )	rF   rv   r=   r   rI   ra   ZkernelsrP   rS   r   r   r   _cuda_load_module  s(    


4rw   F)deviceoptional	allow_cpur   c                 C   s   t | tr| S t | tr"t| } t | tjrd|rL| jdvrdtd|  n| jdkrdtd|  tj st | tj	jr| j
S t| ||S )a  Get the device index from :attr:`device`, which can be a torch.device object, a Python integer, or ``None``.

    If :attr:`device` is a torch.device object, returns the device index if it
    is a CUDA device. Note that for a CUDA device without a specified index,
    i.e., ``torch.device('cuda')``, this will return the current default CUDA
    device if :attr:`optional` is ``True``. If :attr:`allow_cpu` is ``True``,
    CPU devices will be accepted and ``-1`` will be returned in this case.

    If :attr:`device` is a Python integer, it is returned as is.

    If :attr:`device` is ``None``, this will return the current default CUDA
    device if :attr:`optional` is ``True``.
    )r4   cpuz(Expected a cuda or cpu device, but got: r4   z!Expected a cuda device, but got: )rd   r1   rZ   r=   rx   rk   rf   ZjitZis_scriptingr4   idx_torch_get_device_index)rx   ry   rz   r   r   r   r   N  s    





r   )Nr   NN)N)FF)r   r	   typingr   r   r   r=   Ztorch._utilsr   r}   r   r   r1   r   r   rZ   ru   bytesrG   rH   rO   dictrw   boolr   r   r   r   <module>   s@       |] 1 