a
    jº”h
K  ã                   @   sl  d dl mZmZmZ d dlmZmZmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd d	lmZ ed
œdd„Ze
j dœdd„Z!e "¡ dd„ ƒZ#e "¡ e$dœdd„ƒZ%e$dœdd„Z&e "¡ e$dœdd„ƒZ'e "d¡dd„ ƒZ(e$dœdd„Z)eddG d d!„ d!ƒƒZ*G d"d#„ d#eƒZ+dS )$é    )ÚBaseBackendÚ	GPUTargetÚLanguage)ÚirÚpassesÚllvmÚnvidia)Úknobs)Ú
PTXASError)Ú	dataclassN)ÚAnyÚDictÚTupleÚOptional)Ú
ModuleType)ÚPath©Útargetc                 C   s   t tttf dœdd„}|S )N©Úreturnc                 S   s4   | j j}|j j}||ks J dƒ‚|dkr,dS dS d S )Nz%lhs and rhs bitwidth must be the sameé   )é   r   é    )r   r   r   )ZscalarZprimitive_bitwidth)Zlhs_typeZrhs_typeZlhs_bitwidthZrhs_bitwidth© r   úM/var/www/auris/lib/python3.9/site-packages/triton/backends/nvidia/compiler.pyÚcheck_dot_compatibility   s    z-min_dot_size.<locals>.check_dot_compatibility)r   Úint)r   r   r   r   r   Úmin_dot_size   s    	r   r   c                   C   s   t jjS ©N)r	   r   Úptxasr   r   r   r   Ú	get_ptxas!   s    r    c                  C   s0   t jj} | d ur| S t tƒ jdg¡ d¡}|S )Nz	--versionúutf-8)r	   r   Zmock_ptx_versionÚ
subprocessÚcheck_outputr    ÚpathÚdecode)Zmock_verÚversionr   r   r   Úget_ptxas_version%   s
    r'   c                 C   sv   t | tƒsJ ‚tt|  d¡ƒ\}}|dkrF|dk r:d| S d| d S |dkrVd| S |dkrfd	| S td
|  ƒ‚dS )zK
    Get the highest PTX version supported by the current CUDA driver.
    Ú.é   é   éP   é   é   éF   é
   é?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: N)Ú
isinstanceÚstrÚmapr   ÚsplitÚRuntimeError)Úcuda_versionÚmajorÚminorr   r   r   Úptx_get_version.   s    r9   )Úarchc                 C   s"   | j }|d u rtƒ j}t|ƒ}|S r   )Úptx_versionr    r&   r9   )Úoptionsr:   r;   r6   r   r   r   Úget_ptx_version_from_optionsA   s
    r=   c                 C   s"   t | |ƒ}td|ƒ}d|› }|S )NéV   z+ptx)r=   Úmin)r<   r:   r;   Zllvm_ptx_versionÚfeaturesr   r   r   Úget_featuresI   s    


rA   c                 C   s@   t | dƒ"}t | ¡ ¡ ¡ W  d   ƒ S 1 s20    Y  d S )NÚrb)ÚopenÚhashlibÚsha256ÚreadÚ	hexdigest)r$   Úfr   r   r   Ú	file_hashW   s    rI   )Ú
capabilityc                 C   s   | dkrdnd}d| › |› S )NéZ   ÚaÚ Zsm_r   )rJ   Úsuffixr   r   r   Úsm_arch_from_capability]   s    rO   T)Úfrozenc                   @   s.  e Zd ZU dZeed< dZeed< dZeed< dZe	e ed< d	Z
eed
< dZeed< dZeed< dZe	e ed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"d#„ Z d$d%„ Z!dS )&ÚCUDAOptionsé   Ú	num_warpsr,   Únum_ctasé   Ú
num_stagesNÚmaxnreg)r,   r,   r,   Úcluster_dimsr;   Úptx_optionsÚir_overrideTÚenable_fp_fusionFÚlaunch_cooperative_gridÚ
launch_pdl)Zfp8e5Úfp8e4b15Úsupported_fp8_dtypesr   Ú!deprecated_fp8_dot_operand_dtypesÚtf32Údefault_dot_input_precision)ra   Ztf32x3ZieeeÚallowed_dot_input_precisionsÚmax_num_imprecise_acc_defaultÚextern_libsÚdebugÚcudaÚbackend_nameÚsanitize_overflowr:   c                 C   sŠ   t tƒjd }| jd u ri nt| jƒ}| dd ¡sJtjjpDt	|d ƒ|d< t
 | dt| ¡ ƒ¡ | jdkr~| j| jd @ dks†J dƒ‚d S )NÚlibÚ	libdevicezlibdevice.10.bcre   r   r,   znum_warps must be a power of 2)r   Ú__file__Úparentre   ÚdictÚgetr	   r   Zlibdevice_pathr2   ÚobjectÚ__setattr__ÚtupleÚitemsrS   )ÚselfZdefault_libdirre   r   r   r   Ú__post_init__}   s     ÿzCUDAOptions.__post_init__c                 C   sX   t | jƒ}tdd„ t|d ƒD ƒƒ|d< d dd„ t| ¡ ƒD ƒ¡}t | d¡¡ 	¡ S )Nc                 s   s   | ]\}}|t |ƒfV  qd S r   )rI   )Ú.0ÚkÚvr   r   r   Ú	<genexpr>‰   ó    z#CUDAOptions.hash.<locals>.<genexpr>re   Ú_c                 S   s   g | ]\}}|› d |› ‘qS )ú-r   )rv   ÚnameÚvalr   r   r   Ú
<listcomp>Š   rz   z$CUDAOptions.hash.<locals>.<listcomp>r!   )
rn   Ú__dict__rr   ÚsortedÚjoinrs   rD   rE   ÚencoderG   )rt   Z	hash_dictÚkeyr   r   r   Úhash‡   s    
zCUDAOptions.hash)"Ú__name__Ú
__module__Ú__qualname__rS   r   Ú__annotations__rT   rV   rW   r   rX   rr   r;   rY   r2   rZ   r[   Úboolr\   r]   r_   r   r`   rb   rc   rd   re   rn   rf   rh   ri   r:   ru   r…   r   r   r   r   rQ   c   s.   

rQ   c                       sØ   e Zd Zeedœdd„ƒZdd„ Zedœdd„Zed	d
œ‡ fdd„Z	e
dœdd„Zdd„ Zdd„ Zeeef dœdd„Zdd„ Zedd„ ƒZedd„ ƒZdd„ Zdd„ Zdd „ Zd!d"„ Zd#d$„ Ze ¡ d%d&„ ƒZ‡  ZS )'ÚCUDABackendr   c                 C   s
   | j dkS )Nrg   )Úbackendr   r   r   r   Úsupports_target   s    zCUDABackend.supports_targetc                 C   s0   d}t  ||¡}|s"td|› ƒ‚t| d¡ƒS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r,   )ÚreÚ	fullmatchÚ
ValueErrorr   Úgroup)rt   r:   ÚpatternÚmatchr   r   r   Ú_parse_arch”   s
    zCUDABackend._parse_archr   c                 C   s   |   |j¡}d|› S )Núcuda:)r”   r:   )rt   r<   rJ   r   r   r   Úget_target_name›   s    zCUDABackend.get_target_nameN)r   r   c                    s   t ƒ  |¡ d| _d S )NÚcubin)ÚsuperÚ__init__Z
binary_ext)rt   r   ©Ú	__class__r   r   r™   Ÿ   s    zCUDABackend.__init__c                    sÌ   dt jjpd| jj› i}| ‡ fdd„tj ¡ D ƒ¡ t	|  
|d ¡ƒ}d|vr~ttjƒ}|dkrn| d¡ tt|ƒƒ|d< d|vr–|d	kr–d
|d< d|vrªt jj|d< |d	kr¶dnd|d< tf i |¤ŽS )Nr:   Úsmc                    s*   i | ]"}|ˆ v rˆ | d ur|ˆ | “qS r   r   )rv   rw   ©Úoptsr   r   Ú
<dictcomp>¥   rz   z-CUDABackend.parse_options.<locals>.<dictcomp>r_   éY   Zfp8e4nvr`   rK   )r^   r[   i   @r   rd   )r	   ZruntimeZoverride_archr   r:   ÚupdaterQ   Z__dataclass_fields__Úkeysr   r”   Úsetr_   Úaddrr   r   ÚlanguageZdefault_fp_fusion)rt   rž   ÚargsrJ   r_   r   r   r   Úparse_options£   s    

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r,   é   )rS   rT   ÚsharedrX   )rt   Úmetadatar   r   r   Úpack_metadata¹   s    úzCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t|  |j¡ƒ}|dkr6|jn|jt	| j
ƒdœ}|S )Nr   r+   )Zconvert_custom_typesr   )Útriton.language.extra.cudar¥   Úextrarg   r   r”   r:   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70r   r   )rt   r<   rg   rJ   Zcodegen_fnsr   r   r   Úget_codegen_implementationÃ   s    ýz&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )rk   ztriton.language.extra.libdevice)r¬   rk   )rt   rk   r   r   r   Úget_module_mapÍ   s    zCUDABackend.get_module_mapc                 C   s   t  |¡ d S r   )r   Úload_dialects)rt   Úctxr   r   r   r°   Ñ   s    zCUDABackend.load_dialectsc                 C   sš   t  | j¡}| ¡  tj |¡ tj |¡ |d dk rDtj 	|¡ tj 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ tj |¡ | | ¡ | S )Nr/   é	   )r   Úpass_managerÚcontextÚenable_debugr   ÚcommonÚadd_inlinerÚttirZadd_rewrite_tensor_pointerZ(add_rewrite_tensor_descriptor_to_pointerÚadd_canonicalizerZadd_combineZadd_reorder_broadcastÚadd_cseÚadd_symbol_dceZadd_loop_unrollÚrun)Úmodrª   ÚoptrJ   Úpmr   r   r   Ú	make_ttirÔ   s    
zCUDABackend.make_ttirc                 C   sx  |j d ur&|  dt | j¡ |j ¡¡ t ¡ }|jd ur\|jd |_	|jd |_
|jd |_t | j¡}| ¡ }tj |d|› |jd|j¡ tj |¡ |d dkr²tj |¡ tjj ||¡ tj |¡ tj |¡ tj |¡ tj |¡ tj ||d	k¡ tjj |¡ tj |¡ |d d
v r¬tj |¡ tj  |¡ tj !|¡ tj  |¡ tj "|¡ tjj# $||j%|¡ tj &||j%¡ tj '|¡ tj (||j%|¡ n¾|d dkr^tj |¡ tj  |¡ tj !|¡ tj )|¡ tj *|¡ tjj +|¡ tj &||j%¡ tj '|¡ tj ,||j%¡ tj (||j%|¡ tj "|¡ tjj -|¡ ntj !|¡ tj  |¡ tj |¡ tj .|¡ tj ||d	k¡ tj /|¡ tjj 0|¡ tj |¡ tjj 1|¡ tj 2|¡ tj 3|¡ tj |¡ tj 4|¡ |d dkr.tjj 5|¡ tjj 6|¡ tj 7|¡ tj  |¡ | 8| ¡ |j	|j
|jf|d< |  9¡ }||d< | S )Nzttg.maxnregr   r,   r¨   r•   r   r/   r   r+   )r   r²   r²   rX   Útensordesc_meta):rW   Zset_attrr   Zbuilderr´   Zget_int32_attrr   ZClusterInforX   ZclusterDimXZclusterDimYZclusterDimZr³   rµ   r   r¸   Zadd_convert_to_ttgpuirrS   rT   ÚttgpuirZadd_coalesceZadd_f32_dot_tcÚ	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsZ add_optimize_descriptor_encodingÚadd_loop_aware_cseZadd_fuse_nested_loopsr¶   r¹   Zadd_triton_licmÚ add_combine_tensor_select_and_ifZhopperZadd_hopper_warpspecrV   Zadd_assign_latenciesZadd_schedule_loopsZadd_pipelineZadd_optimize_accumulator_initZadd_hoist_tmem_allocZadd_promote_lhs_to_tmemZadd_warp_specializeZadd_remove_tmem_tokensZadd_prefetchZadd_coalesce_async_copyZadd_optimize_tmem_layoutsZadd_interleave_tmemZadd_reduce_data_duplicationZadd_reorder_instructionsr»   Zadd_tma_loweringZadd_fence_insertionÚadd_sccpr¼   Úget_tensordesc_metadata)r½   rª   r¾   rJ   Zcluster_infor¿   Zdump_enabledrÁ   r   r   r   Ú
make_ttgirå   s†    


zCUDABackend.make_ttgirc                 C   sn   |}t  |j¡}| ¡  tj |¡ tj |¡ tj	 
|¡ tj |¡ tj |¡ | |¡ | ¡ |d< |S )NrÁ   )r   r³   r´   rµ   r   rÂ   r·   r¶   rÆ   r¸   rÄ   r¹   rÅ   r¼   rÇ   )rt   Úsrcrª   r<   rJ   r½   r¿   r   r   r   Ú	ttgir_opt0  s    
zCUDABackend.ttgir_optc                 C   s  t || jjƒ}|}t |j¡}| ¡  tjj	 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ tjj	 |¡ tj |¡ tjj |||¡ tj |¡ tj |¡ tjj	 |¡ tjj	 |¡ tj |¡ tj |¡ tj |¡ tjjsütj |¡ | |¡ t  !¡  t  ¡ }tjj"r(t#dƒ‚t  $||¡}	t%|ƒ}
t&|| jjƒ}d}t '¡  t  (|	||
|¡ t )|	¡ |j*r”dd„ |j*D ƒ}t  +|	|¡ t  ,|	t j-¡ | .d¡}|d ur¾||d< | .d¡|d< | .d	¡|d
< | .d¡|d< | .d¡|d< t/|	ƒ}~	~|S )NzYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendúnvptx64-nvidia-cudac                 S   s   g | ]\}}|‘qS r   r   )rv   r}   r$   r   r   r   r   h  rz   z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsrS   z
ttg.sharedr©   zttg.tensor_memory_sizeZ	tmem_sizezttg.global_scratch_memory_sizeZglobal_scratch_sizez#ttg.global_scratch_memory_alignmentZglobal_scratch_align)0r=   r   r:   r   r³   r´   rµ   r   r   rÃ   Zadd_lower_mmarÂ   rÅ   Zadd_allocate_warp_groupsÚconvertZadd_scf_to_cfZadd_allocate_shared_memoryZadd_allocate_tensor_memoryZ"add_allocate_global_scratch_memoryZadd_to_llvmirr¶   r¹   rº   Zadd_nvgpu_to_llvmZadd_warp_specialize_to_llvmr»   r	   ÚcompilationÚdisable_line_infoZllvmirZadd_di_scoper¼   r   Zinit_targetsZenable_asanr5   Z	to_modulerO   rA   Zset_short_ptrZattach_datalayoutZset_nvvm_reflect_ftzre   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3Zget_int_attrr2   )rt   rÉ   rª   r<   rJ   r;   r½   r¿   r´   Zllvm_modÚprocr@   ÚtripleÚpathsZtotal_num_warpsÚretr   r   r   Ú	make_llir?  sd    

ÿ


zCUDABackend.make_llirc              	   C   sÜ   t || jjƒ}d}t|ƒ}t|| jjƒ}t ||||g |jd¡}	t 	d|	¡}
t
|
ƒdks\J ‚|
d |d< |d › d|d › }tjd	d
|› |	tjd}	tjdd|› |	tjd}	t dd|	¡}	tjjrØtdƒ t|	ƒ |	S )NrË   Fz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r,   r   r}   r/   r(   z\.version \d+\.\d+z	.version )Úflagsz\.target sm_\d+z.target sm_z,\s*debug|debug,\s*rM   z // -----// NVPTX Dump //----- //)r=   r   r:   rO   rA   r   Ztranslate_to_asmr[   rŽ   ÚfindallÚlenÚsubÚ	MULTILINEr	   r   Z
dump_nvptxÚprint)rt   rÉ   rª   r¾   rJ   r;   rÐ   rÏ   r@   rÒ   Únamesr   r   r   Úmake_ptx{  s     zCUDABackend.make_ptxc                 C   s„  t ƒ j}tjddddT}tjdddd }| |¡ | ¡  |jd }tjj	r\dd	gndg}	|j
rlg nd
g}
t|ƒ}tjjrŠddgng }|jr |j d¡ng }|g|	¢|
¢d‘|¢|¢d|› ‘|j‘d‘|‘}zNtj|dd|d tj |j¡rt |j¡ tj |j¡r"t |j¡ W nÎ tjyò } z²t|jƒ}| ¡ }W d   ƒ n1 sb0    Y  tj |j¡rˆt |j¡ |jdkršd}n$|jdtj kr²d}nd|j› }t|› d|› dd |¡› dƒ‚W Y d }~n
d }~0 0 t|dƒ}| ¡ }W d   ƒ n1 s0    Y  tj |¡r@t |¡ W d   ƒ n1 sV0    Y  W d   ƒ n1 sv0    Y  |S )NFÚwz.ptx)ÚdeleteÚmoderN   Úrz.logz.oz	-lineinfoz-suppress-debug-infoz--fmad=falsez--opt-levelÚ0ú z-vz--gpu-name=z-oT)ÚcheckÚ	close_fdsÚstderréÿ   z!Internal Triton PTX codegen erroré€   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: Ú
rB   )r    r$   ÚtempfileÚNamedTemporaryFileÚwriteÚflushr}   r	   rÍ   rÎ   r[   rO   r   Zdisable_ptxas_optrY   r4   r"   r¼   ÚosÚexistsÚremoveÚCalledProcessErrorrC   rF   Ú
returncodeÚsignalÚSIGSEGVr
   r‚   )rt   rÉ   rª   r¾   rJ   r   ÚfsrcZflogZfbinZ	line_infoZfmadr:   Zdisable_optZptx_extra_optionsZ	ptxas_cmdÚeZlog_fileÚlogÚerrorrH   r—   r   r   r   Ú
make_cubin‘  sv    ÿ

ÿÿÿÿÿÿÿÿÿþ(ÿþ (JzCUDABackend.make_cubinc                    sš   ˆ  ˆj¡‰ |tjkr@‡ ‡‡fdd„|d< ‡ ‡‡fdd„|d< n|tjkr^‡ ‡‡fdd„|d< ‡ ‡‡fdd„|d< ‡‡fd	d„|d
< ‡‡fdd„|d< d S )Nc                    s   ˆ  | |ˆˆ ¡S r   )rÀ   ©rÉ   rª   ©rJ   r<   rt   r   r   Ú<lambda>Ç  rz   z(CUDABackend.add_stages.<locals>.<lambda>r¸   c                    s   ˆ  | |ˆˆ ¡S r   )rÈ   rø   rù   r   r   rú   È  rz   Zttgirc                    s   ˆ  | |ˆˆ ¡S r   )rÊ   rø   rù   r   r   rú   Ê  rz   c                    s   ˆ  | |ˆˆ ¡S r   )rÓ   rø   rù   r   r   rú   Ë  rz   Zllirc                    s   ˆ  | |ˆ ˆjj¡S r   )rÛ   r   r:   rø   ©r<   rt   r   r   rú   Ì  rz   Zptxc                    s   ˆ  | |ˆ ˆjj¡S r   )r÷   r   r:   rø   rû   r   r   rú   Í  rz   r—   )r”   r:   r   ZTRITONZGLUON)rt   Zstagesr<   r¥   r   rù   r   Ú
add_stagesÄ  s    

zCUDABackend.add_stagesc                 C   s   t ƒ }|› d| jj› S )Nr|   )r'   r   r:   )rt   r&   r   r   r   r…   Ï  s    zCUDABackend.hash)r†   r‡   rˆ   Ústaticmethodr   r   r”   r2   r–   r™   r   r§   r«   r®   r   r   r¯   r°   rÀ   rÈ   rÊ   rÓ   rÛ   r÷   rü   Ú	functoolsÚ	lru_cacher…   Ú__classcell__r   r   rš   r   r‹   Ž   s*   



J<3r‹   ),Ztriton.backends.compilerr   r   r   Ztriton._C.libtritonr   r   r   r   Ztritonr	   Ztriton.runtime.errorsr
   Zdataclassesr   rþ   Útypingr   r   r   r   Útypesr   rD   rŽ   rè   rñ   rì   r"   Úpathlibr   r   Z
NvidiaToolr    rÿ   r'   r   r9   r=   rA   rI   rO   rQ   r‹   r   r   r   r   Ú<module>   s:   

*