o
    bZŽhãK  ã                   @   s†  d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZ d dlZd dlmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlZd	efd
d„Ze ¡ defdd„ƒZ e ¡ de!fdd„ƒZ"e ¡ de!fdd„ƒZ#e ¡ de!fdd„ƒZ$de!fdd„Z%e ¡ de!fdd„ƒZ&e d¡dd„ ƒZ'de!fdd„Z(ed d!G d"d#„ d#ƒƒZ)G d$d%„ d%eƒZ*dS )&é    )ÚBaseBackendÚ	GPUTarget)ÚirÚpassesÚllvmÚnvidia)Ú
PTXASError)Ú	dataclassN)ÚAnyÚDictÚTupleÚOptional)Ú
ModuleType)ÚPathÚtargetc                 C   s   dt tttf fdd„}|S )NÚreturnc                 S   s0   | j j}|j j}||ksJ dƒ‚|dkrdS dS )Nz%lhs and rhs bitwidth must be the sameé   )é   r   é    )r   r   r   )ZscalarZprimitive_bitwidth)Zlhs_typeZrhs_typeZlhs_bitwidthZrhs_bitwidth© r   úN/var/www/auris/lib/python3.10/site-packages/triton/backends/nvidia/compiler.pyÚcheck_dot_compatibility   s   z-min_dot_size.<locals>.check_dot_compatibility)r   Úint)r   r   r   r   r   Úmin_dot_size   s   	r   Úbinaryc                 C   s¾   | t  d¡7 } tj d|  ¡ › dd¡tj tj t	¡d| ¡g}|D ]5}tj 
|¡rWtj |¡rWtj|dgtjd}|d urWtjd| d	¡tjd
}|d urW|| d¡f  S q"td| › ƒ‚)NÚEXEZTRITON_Z_PATHÚ Úbinú	--version)Ústderrz.*release (\d+\.\d+).*úutf-8©Úflagsé   zCannot find )Ú	sysconfigÚget_config_varÚosÚenvironÚgetÚupperÚpathÚjoinÚdirnameÚ__file__ÚexistsÚisfileÚ
subprocessÚcheck_outputÚSTDOUTÚreÚsearchÚdecodeÚ	MULTILINEÚgroupÚRuntimeError)r   Úpathsr*   ÚresultÚversionr   r   r   Ú_path_to_binary!   s   þ€r<   Úarchc                 C   s   | dkrdnd}t |ƒS )Néd   zptxas-blackwellÚptxas)r<   )r=   Únamer   r   r   Ú	get_ptxas3   s   rA   c                 C   s8   t j d¡}|d ur|S t t| ƒd dg¡ d¡}|S )NZTRITON_MOCK_PTX_VERSIONr   r   r    )r&   r'   r(   r0   r1   rA   r5   )r=   Zmock_verr;   r   r   r   Úget_ptxas_version9   s
   rB   r   c                 C   sr   t | tƒsJ ‚tt|  d¡ƒ\}}|dkr#|dk rd| S d| d S |dkr+d| S |dkr3d	| S td
|  ƒ‚)zK
    Get the highest PTX version supported by the current CUDA driver.
    Ú.é   é   éP   r#   é   éF   é
   é?   z?Triton only support CUDA 10.0 or higher, but got CUDA version: )Ú
isinstanceÚstrÚmapr   Úsplitr8   )Úcuda_versionÚmajorÚminorr   r   r   Úptx_get_versionB   s   rR   c                 C   s&   | j }|d u rt|ƒ\}}t|ƒ}|S ©N)Úptx_versionrA   rR   )Úoptionsr=   rT   Ú_rO   r   r   r   Úget_ptx_version_from_optionsU   s
   rW   c                 C   s"   t | |ƒ}td|ƒ}d|› }|S )NéV   z+ptx)rW   Úmin)rU   r=   rT   Zllvm_ptx_versionÚfeaturesr   r   r   Úget_features]   s   


r[   c                 C   s@   t | dƒ}t | ¡ ¡ ¡ W  d   ƒ S 1 sw   Y  d S )NÚrb)ÚopenÚhashlibÚsha256ÚreadÚ	hexdigest)r*   Úfr   r   r   Ú	file_hashk   s   $ÿrc   Ú
capabilityc                 C   s   | dkrdnd}d| › |› S )NéZ   Úar   Zsm_r   )rd   Úsuffixr   r   r   Úsm_arch_from_capabilityq   s   rh   T)Úfrozenc                   @   s6  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZee ed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed< d Zeed!< dZeed"< dZ eed#< d$d%„ Z!d&d'„ Z"dS )(ÚCUDAOptionsé   Ú	num_warpsr#   Únum_ctasé   Ú
num_stagesr   Únum_buffers_warp_specÚnum_consumer_groupsÚreg_dec_producerÚreg_inc_consumerNÚmaxnreg)r#   r#   r#   Úcluster_dimsrT   TÚenable_fp_fusionFÚlaunch_cooperative_grid)Zfp8e5Úfp8e4b15Úsupported_fp8_dtypesr   Údeprecated_fp8_dtypesÚtf32Údefault_dot_input_precision)r{   Ztf32x3ZieeeÚallowed_dot_input_precisionsÚmax_num_imprecise_acc_defaultÚextern_libsÚdebugÚcudaÚbackend_nameÚsanitize_overflowr=   c                 C   sŠ   t tƒjd }| jd u ri nt| jƒ}| dd ¡s%t dt|d ƒ¡|d< t	 
| dt| ¡ ƒ¡ | jdkr?| j| jd @ dksCJ dƒ‚d S )	NÚlibÚ	libdeviceZTRITON_LIBDEVICE_PATHzlibdevice.10.bcr   r   r#   znum_warps must be a power of 2)r   r-   Úparentr   Údictr(   r&   ÚgetenvrL   ÚobjectÚ__setattr__ÚtupleÚitemsrl   )ÚselfZdefault_libdirr   r   r   r   Ú__post_init__’   s    ÿzCUDAOptions.__post_init__c                 C   sX   t | jƒ}tdd„ t|d ƒD ƒƒ|d< d dd„ t| ¡ ƒD ƒ¡}t | d¡¡ 	¡ S )Nc                 s   s     | ]\}}|t |ƒfV  qd S rS   )rc   )Ú.0ÚkÚvr   r   r   Ú	<genexpr>   s   € z#CUDAOptions.hash.<locals>.<genexpr>r   rV   c                 S   s   g | ]\}}|› d |› ‘qS )ú-r   )r   r@   Úvalr   r   r   Ú
<listcomp>ž   s    z$CUDAOptions.hash.<locals>.<listcomp>r    )
r‡   Ú__dict__r‹   Úsortedr+   rŒ   r^   r_   Úencodera   )r   Z	hash_dictÚkeyr   r   r   Úhash›   s   
zCUDAOptions.hash)#Ú__name__Ú
__module__Ú__qualname__rl   r   Ú__annotations__rm   ro   rp   rq   rr   rs   rt   r   ru   r‹   rT   rv   Úboolrw   ry   r   rL   rz   r|   r}   r~   r   r‡   r€   r‚   rƒ   r=   rŽ   rš   r   r   r   r   rj   w   s2   
 	rj   c                       sÄ   e Zd Zedefdd„ƒZdd„ Zdeddf‡ fdd	„Zdefd
d„Z	dd„ Z
dd„ Zdeeef fdd„Zdd„ Zedd„ ƒZedd„ ƒZdd„ Zdd„ Zdd„ Zdd„ Ze ¡ d d!„ ƒZ‡  ZS )"ÚCUDABackendr   c                 C   s
   | j dkS )Nr   )Úbackend)r   r   r   r   Úsupports_target¤   s   
zCUDABackend.supports_targetc                 C   s0   d}t  ||¡}|std|› ƒ‚t| d¡ƒS )Nz	^sm(\d+)$z(TRITON_OVERRIDE_ARCH must have the form r#   )r3   Ú	fullmatchÚ
ValueErrorr   r7   )r   r=   ÚpatternÚmatchr   r   r   Ú_parse_arch¨   s
   zCUDABackend._parse_archr   Nc                    s   t ƒ  |¡ d| _d S )NÚcubin)ÚsuperÚ__init__Z
binary_ext)r   r   ©Ú	__class__r   r   rª   ¯   s   
zCUDABackend.__init__c                    sÔ   dt  dd| jj› ¡i}| ‡ fdd„tj ¡ D ƒ¡ t|  	|d ¡ƒ}d|vr?t
tjƒ}|dkr7| d¡ tt|ƒƒ|d< d	|vrK|d
krKd|d	< d|vrYt  dd¡dk|d< |d
kr_dnd|d< tdi |¤ŽS )Nr=   ZTRITON_OVERRIDE_ARCHÚsmc                    s*   i | ]}|ˆ v rˆ | d ur|ˆ | “qS rS   r   )r   r   ©Úoptsr   r   Ú
<dictcomp>µ   s   * z-CUDABackend.parse_options.<locals>.<dictcomp>ry   éY   Zfp8e4nvrz   re   )rx   rv   ZTRITON_DEFAULT_FP_FUSIONÚ1i   @r   r~   r   )r&   rˆ   r   r=   Úupdaterj   Ú__dataclass_fields__Úkeysr   r§   Úsetry   Úaddr‹   r—   )r   r¯   Úargsrd   ry   r   r®   r   Úparse_options³   s   

zCUDABackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r#   é   )rl   rm   Úsharedru   )r   Úmetadatar   r   r   Úpack_metadataÉ   s   úzCUDABackend.pack_metadatac                 C   sL   dd l m  m  m} t|  |j¡ƒ}|dkr|jn|jt	| j
ƒdœ}|S )Nr   rF   )Zconvert_custom_typesr   )Útriton.language.extra.cudaÚlanguageÚextrar   r   r§   r=   Zconvert_custom_float8_sm80Zconvert_custom_float8_sm70r   r   )r   rU   r   rd   Zcodegen_fnsr   r   r   Úget_codegen_implementationÓ   s   ýz&CUDABackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )r…   ztriton.language.extra.libdevice)r¾   r…   )r   r…   r   r   r   Úget_module_mapÝ   s   zCUDABackend.get_module_mapc                 C   s   t  |¡ d S rS   )r   Úload_dialects)r   Úctxr   r   r   rÃ   á   s   zCUDABackend.load_dialectsc                 C   s‚   t  | j¡}| ¡  tj |¡ tj |¡ tj 	|¡ tj 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ | | ¡ | S rS   )r   Úpass_managerÚcontextÚenable_debugr   ÚcommonZadd_inlinerÚttirZadd_rewrite_tensor_pointerÚadd_canonicalizerZadd_combineZadd_reorder_broadcastÚadd_cseÚadd_symbol_dceZadd_loop_unrollÚrun)Úmodr¼   ÚoptÚpmr   r   r   Ú	make_ttirä   s   
zCUDABackend.make_ttirc                 C   sˆ  t  ¡ }|jd ur|jd |_|jd |_|jd |_t | j¡}| 	¡ }t
j |d|› |jd|j¡ t
j |¡ |d dkrFt
j |¡ t j
j ||¡ t
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j ||dk¡ t
j |¡ |d d	v rßt
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j ||j ¡ t
j !||j ¡ t
j "||j ¡ t
j #||j$|j |j%|j&¡ t
j '||j(|¡ t
j )||j ¡ t
j *||j ¡ nw|d dkrPt
j |¡ t
j |¡ t
j |¡ t
j |¡ t
j ||j ¡ t
j !||j ¡ t
j "||j ¡ t
j #||j$|j |j%|j&¡ t
j '||j(|¡ t
j |¡ t j
j +|¡ t j
j ,|¡ t
j *||j ¡ t
j |¡ nt
j |¡ t
j -|¡ t
j ||dk¡ t
j .|¡ t
j |¡ t
j /|¡ t
j 0|¡ t
j |¡ t
j 1|¡ |d d
kržt j
j 2|¡ t j
j 3|¡ t
j |¡ |d d
kr³t
j 4||j ¡ | 5| ¡ |j|j|jf|d< | S )Nr   r#   rº   zcuda:r   rI   r   rF   )r   é	   rÒ   ru   )6r   ZClusterInforu   ZclusterDimXZclusterDimYZclusterDimZr   rÅ   rÆ   rÇ   r   rÉ   Zadd_convert_to_ttgpuirrl   rm   ÚttgpuirZadd_coalesceZadd_f32_dot_tcÚ	ttnvgpuirZadd_plan_ctaZadd_remove_layout_conversionsZadd_optimize_thread_localityZadd_accelerate_matmulZadd_optimize_dot_operandsrÈ   rË   Zadd_fuse_nested_loopsrÊ   Zadd_licmZadd_optimize_accumulator_initÚ add_combine_tensor_select_and_ifZadd_ws_task_partitionrq   Zadd_taskid_propagateZadd_ws_data_partitionZadd_ws_code_partitionrp   rr   rs   Zadd_pipelinero   Zadd_ping_pong_syncZadd_ws_loweringZadd_promote_lhs_to_tmemZadd_keep_acc_in_tmemZadd_prefetchZadd_coalesce_async_copyZadd_reduce_data_duplicationZadd_reorder_instructionsrÌ   Zadd_fence_insertionZadd_tma_loweringZadd_ws_canonicalizationrÍ   )rÎ   r¼   rÏ   rd   Zcluster_inforÐ   Zdump_enabledr   r   r   Ú
make_ttgiró   sŠ   
ÿÿ
zCUDABackend.make_ttgirc                 C   s@  t || jjƒ}|}t |j¡}| ¡  tjj	 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ tjj	 |¡ tj |¡ tjj |||¡ tj |¡ tj |¡ tjj	 |¡ tjj	 |¡ tj |¡ tj |¡ tj |¡ tj dd¡dkrƒtj |¡ | |¡ t  !¡  t  ¡ }tj dd¡dkrt"dƒ‚t  #||¡}	t$|ƒ}
t%|| jjƒ}d}t  &|	||
|¡ t '|	¡ |j(d ur×|	 )¡ D ]}| *¡ sÖ| +¡ rÖ| ,|j(¡ qÆ|j-rèdd„ |j-D ƒ}t  .|	|¡ t  /|	t j0¡ | 1d	¡}|d urü||d
< | 1d¡|d< | 1d¡|d< | 1d¡|d< | 1d¡|d< t2|	ƒ}~	~|S )NÚTRITON_DISABLE_LINE_INFOÚ0ZTRITON_ENABLE_ASANr²   zYAddress Sanitizer Error: Address sanitizer is currently only supported on the AMD backendúnvptx64-nvidia-cudac                 S   s   g | ]\}}|‘qS r   r   )r   r@   r*   r   r   r   r•   j  ó    z)CUDABackend.make_llir.<locals>.<listcomp>zttg.total-num-warpsrl   z
ttg.sharedr»   zttg.tensor_memory_sizeZ	tmem_sizezttg.global_scratch_memory_sizeZglobal_scratch_sizez#ttg.global_scratch_memory_alignmentZglobal_scratch_align)3rW   r   r=   r   rÅ   rÆ   rÇ   r   r   rÔ   Zadd_lower_mmarÓ   rÕ   Zadd_allocate_warp_groupsÚconvertZadd_scf_to_cfZadd_allocate_shared_memoryZadd_allocate_tensor_memoryZ"add_allocate_global_scratch_memoryZadd_to_llvmirrÈ   rÊ   rË   Zadd_nvgpu_to_llvmZadd_warp_specialize_to_llvmrÌ   r&   r'   r(   ZllvmirZadd_di_scoperÍ   r   Zinit_targetsr8   Z	to_modulerh   r[   Zattach_datalayoutZset_nvvm_reflect_ftzrt   Zget_functionsZis_declarationZis_external_linkageZset_nvvm_maxnregr   Zlink_extern_libsZoptimize_moduleZOPTIMIZE_O3Zget_int_attrrL   )r   Úsrcr¼   rU   rd   rT   rÎ   rÐ   rÆ   Zllvm_modÚprocrZ   Útripler   r9   Ztotal_num_warpsÚretr   r   r   Ú	make_llir<  sl   
ÿ

€
zCUDABackend.make_llirc              	   C   sè   t || jjƒ}d}t|ƒ}t|| jjƒ}t ||||dg|jd¡}	t 	d|	¡}
t
|
ƒdks/J ‚|
d |d< |d › d	|d › }tjd
d|› |	tjd}	tjdd|› |	tjd}	t dd|	¡}	tj dd¡dkrrtdƒ t|	ƒ |	S )NrÙ   znvptx-short-ptrFz(.visible .entry ([a-zA-Z_][a-zA-Z0-9_]*)r#   r   r@   rI   rC   z\.version \d+\.\d+z	.version r!   z\.target sm_\d+z.target sm_z,\s*debug|debug,\s*r   ZNVPTX_ENABLE_DUMPrØ   r²   z // -----// NVPTX Dump //----- //)rW   r   r=   rh   r[   r   Ztranslate_to_asmrv   r3   ÚfindallÚlenÚsubr6   r&   r'   r(   Úprint)r   rÜ   r¼   rÏ   rd   rT   rÞ   rÝ   rZ   rß   Únamesr   r   r   Úmake_ptx}  s    zCUDABackend.make_ptxc                 C   s~  t | jjƒ\}}tjdddd#}tjdddd }| |¡ | ¡  |jd }	tj	 
dd	¡d
kr7ddgndg}
|jr?g ndg}t|ƒ}tj	 
dd	¡d
krSdd	gng }|g|
¢|¢d‘|¢d|› ‘|j‘d‘|	‘}z%tj|dd|d tj |j¡rƒt |j¡ tj |j¡rt |j¡ W n\ tjyí } zOt|jƒ}| ¡ }W d   ƒ n1 s®w   Y  tj |j¡rÀt |j¡ |jdkrÈd}n|jdtj krÓd}nd|j› }t|› d|› dd |¡› dƒ‚d }~ww t|	dƒ}| ¡ }W d   ƒ n	1 sw   Y  tj |	¡rt |	¡ W d   ƒ n1 sw   Y  W d   ƒ |S W d   ƒ |S 1 s8w   Y  |S )NFÚwz.ptx)ÚdeleteÚmoderg   Úrz.logz.or×   rØ   r²   z	-lineinfoz-suppress-debug-infoz--fmad=falseZDISABLE_PTXAS_OPTz--opt-levelz-vz--gpu-name=z-oT)ÚcheckÚ	close_fdsr   éÿ   z!Internal Triton PTX codegen erroré€   z`ptxas` raised SIGSEGVz`ptxas` failed with error code z
`ptxas` stderr:
z
Repro command: ú Ú
r\   )rA   r   r=   ÚtempfileÚNamedTemporaryFileÚwriteÚflushr@   r&   r'   r(   rv   rh   r0   rÍ   r*   r.   ÚremoveÚCalledProcessErrorr]   r`   Ú
returncodeÚsignalÚSIGSEGVr   r+   )r   rÜ   r¼   rÏ   rd   r?   rV   ÚfsrcZflogZfbinZ	line_infoZfmadr=   Ú	opt_levelZ	ptxas_cmdÚeZlog_fileÚlogÚerrorrb   r¨   r   r   r   Ú
make_cubin“  sp   ÿ

ÿÿÿ.€
ÿ
ÿ
þ€ó
ÿ
€*Ú'Ù'Ù'zCUDABackend.make_cubinc                    sn   ˆ  ˆj¡‰ ‡‡fdd„|d< ‡ ‡‡fdd„|d< ‡ ‡‡fdd„|d< ‡‡fdd„|d	< ‡‡fd
d„|d< d S )Nc                    s   ˆ  | |ˆ ¡S rS   )rÑ   ©rÜ   r¼   ©rU   r   r   r   Ú<lambda>À  s    z(CUDABackend.add_stages.<locals>.<lambda>rÉ   c                    ó   ˆ  | |ˆˆ ¡S rS   )rÖ   r   ©rd   rU   r   r   r   r  Á  ó    Zttgirc                    r  rS   )rà   r   r  r   r   r  Â  r  Zllirc                    ó   ˆ  | |ˆ ˆjj¡S rS   )ræ   r   r=   r   r  r   r   r  Ã  rÚ   Zptxc                    r  rS   )rÿ   r   r=   r   r  r   r   r  Ä  rÚ   r¨   )r§   r=   )r   ZstagesrU   r   r  r   Ú
add_stages¾  s   zCUDABackend.add_stagesc                 C   s   t | jjƒ}|› d| jj› S )Nr“   )rB   r   r=   )r   r;   r   r   r   rš   Æ  s   zCUDABackend.hash)r›   rœ   r   Ústaticmethodr   r¢   r§   rª   r
   r¹   r½   rÁ   r   rL   r   rÂ   rÃ   rÑ   rÖ   rà   ræ   rÿ   r  Ú	functoolsÚ	lru_cacherš   Ú__classcell__r   r   r«   r   r    ¢   s(    



HA+r    )+Ztriton.backends.compilerr   r   Ztriton._C.libtritonr   r   r   r   Ztriton.runtime.errorsr   Údataclassesr	   r	  Útypingr
   r   r   r   Útypesr   r^   r3   rñ   rø   r&   r0   Úpathlibr   r$   r   r
  rL   r<   r   rA   rB   rR   rW   r[   rc   rh   rj   r    r   r   r   r   Ú<module>   sB    
*