o
    bZŽh¿J  ã                   @   sÒ   d dl mZmZ d dlmZmZmZmZ d dlm	Z	 d dl
mZmZmZ d dlmZ d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ defd	d
„Zdd„ Ze	ddG dd„ dƒƒZG dd„ deƒZdS )é    )ÚBaseBackendÚ	GPUTarget)ÚirÚpassesÚllvmÚamd)Ú	dataclass)ÚAnyÚDictÚTuple)Ú
ModuleTypeN)ÚPathÚtargetc                 C   s   dd„ S )Nc                 S   s   dS )N©é   r   r   © )ZlhsTypeZrhsTyper   r   úK/var/www/auris/lib/python3.10/site-packages/triton/backends/amd/compiler.pyÚ<lambda>   s    zmin_dot_size.<locals>.<lambda>r   ©r   r   r   r   Úmin_dot_size   s   r   c                 C   s    | dkrdnd}t  d|¡dkS )NÚgfx942Ú1Ú0ZTRITON_HIP_USE_BLOCK_PINGPONG)ÚosÚgetenv)ÚarchÚdefaultr   r   r   Úis_pingpong_enabled   s   r   T)Úfrozenc                   @   sV  e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< dZ
eed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZee ed< dZee ed< dZeed< dZee ed< dZeed< dZeed< dZeed < dZeed!< dZeed"< dZ eed#< d$Z!eed%< d&Z"eed'< d(d)„ Z#d*d+„ Z$dS ),Ú
HIPOptionsé   Ú	num_warpsr   Úwaves_per_eué   Ú
num_stagesÚnum_ctasr   Únum_buffers_warp_specÚnum_consumer_groupsÚreg_dec_producerÚreg_inc_consumerNÚextern_libsr   Úcluster_dimsFÚdebugTÚsanitize_overflowr   )Úfp8e5Úsupported_fp8_dtypesr   Údeprecated_fp8_dtypesÚieeeÚdefault_dot_input_precision)r1   Úallowed_dot_input_precisionsÚenable_fp_fusionÚlaunch_cooperative_gridÚmatrix_instr_nonkdimÚkpackÚallow_flush_denormÚmax_num_imprecise_acc_defaultÚhipÚbackend_nameÚnoneÚinstruction_sched_variantc                 C   sä   t tƒjd }| jd u ri nt| jƒ}d| jv s"d| jv s"d| jv r$dnd}t | d|¡ | jdkr4d	n| j}t | d
|¡ ddg}|D ]}t	||› d ƒ||< qDt | dt
| ¡ ƒ¡ | jdkrl| j| jd	 @ dkspJ dƒ‚d S )NÚlibZgfx10Zgfx11Zgfx12é    é@   Ú	warp_sizeÚgfx950r   r7   ZocmlZocklz.bcr*   r   znum_warps must be a power of 2)r   Ú__file__Úparentr*   Údictr   ÚobjectÚ__setattr__r7   ÚstrÚtupleÚitemsr!   )ÚselfÚdefault_libdirr*   rA   r7   Zlibsr>   r   r   r   Ú__post_init__G   s   & ÿzHIPOptions.__post_init__c                 C   s.   d  dd„ | j ¡ D ƒ¡}t | d¡¡ ¡ S )NÚ_c                 S   s   g | ]\}}|› d |› ‘qS )ú-r   )Ú.0ÚnameÚvalr   r   r   Ú
<listcomp>X   s    z#HIPOptions.hash.<locals>.<listcomp>úutf-8)ÚjoinÚ__dict__rJ   ÚhashlibÚsha256ÚencodeÚ	hexdigest)rK   Úkeyr   r   r   ÚhashW   s   zHIPOptions.hash)%Ú__name__Ú
__module__Ú__qualname__r!   ÚintÚ__annotations__r"   r$   r%   r&   r'   r(   r)   r*   rE   r+   rI   r,   Úboolr-   r   rH   r/   r   r0   r2   r3   r4   r5   r6   r7   r8   r9   r;   r=   rM   r\   r   r   r   r   r      s8   
 r   c                       s  e Zd Zedefdd„ƒZdeddf‡ fdd„Zdefdd	„Zd
d„ Z	dd„ Z
deeef fdd„Zdd„ Zee ¡ dd„ ƒƒZedd„ ƒZedd„ ƒZedd„ ƒZedd„ ƒZedd„ ƒZedd„ ƒZed d!„ ƒZed"d#„ ƒZed$d%„ ƒZd&d'„ Ze ¡ d(d)„ ƒZ‡  ZS )*Ú
HIPBackendr   c                 C   s
   | j dkS )Nr:   )Úbackendr   r   r   r   Úsupports_target^   s   
zHIPBackend.supports_targetÚreturnNc                    s&   t ƒ  |¡ t|jtƒsJ ‚d| _d S )NÚhsaco)ÚsuperÚ__init__Ú
isinstancer   rH   Z
binary_ext)rK   r   ©Ú	__class__r   r   ri   b   s   
zHIPBackend.__init__c                    sæ   dt  d| jj¡i}| jjdv r#ttjƒ}| dh¡ tt	|ƒƒ|d< dˆ vrOttj
ƒ}| jjdv r:| h d£¡ n| jjdv rG| d	d
h¡ tt	|ƒƒ|d< dˆ vr]t  dd¡dk|d< | ‡ fdd„tj ¡ D ƒ¡ tdi |¤ŽS )Nr   ZTRITON_OVERRIDE_ARCH)Zgfx940Zgfx941r   Ztf32r3   r/   >   Zfp8e4b8Zfp8e5b16Úfp8e4nvrB   rm   r.   r4   ZTRITON_DEFAULT_FP_FUSIONr   c                    s*   i | ]}|ˆ v rˆ | d ur|ˆ | “qS ©Nr   )rP   Úk©Úoptsr   r   Ú
<dictcomp>z   s   * z,HIPBackend.parse_options.<locals>.<dictcomp>r   )r   r   r   r   Úsetr   r3   ÚupdaterI   Úsortedr/   Ú__dataclass_fields__Úkeys)rK   rq   Úargsr3   r/   r   rp   r   Úparse_optionsg   s    

zHIPBackend.parse_optionsc                 C   s(   |j |j|j|jd |jd |jd fS )Nr   r   r#   )r!   r%   Úsharedr+   )rK   Úmetadatar   r   r   Úpack_metadata}   s   úzHIPBackend.pack_metadatac                 C   s   dt | jƒi}|S )Nr   )r   r   )rK   ÚoptionsZcodegen_fnsr   r   r   Úget_codegen_implementation‡   s   z%HIPBackend.get_codegen_implementationc                 C   s   ddl m} d|iS )Nr   )Ú	libdeviceztriton.language.extra.libdevice)Ztriton.language.extra.hipr   )rK   r   r   r   r   Úget_module_map‹   s   zHIPBackend.get_module_mapc                 C   s   t  |¡ d S rn   )r   Úload_dialects)rK   Úctxr   r   r   r      s   zHIPBackend.load_dialectsc                   C   s   t j dd¡dkS )NZAMDGCN_USE_BUFFER_OPSr   r   )r   ÚenvironÚgetr   r   r   r   Úuse_buffer_ops“   s   zHIPBackend.use_buffer_opsc                 C   sL   dd l }d}t| dƒr|  ¡ |kS t| |jƒr$t| dƒr$|  ¡  ¡ |kS dS )Nr   iÿÿÿÚ	ptr_rangeÚuntyped_storageF)ÚtorchÚhasattrr†   rj   ZTensorr‡   Úsize)Úargrˆ   Z
MAX_INT_32r   r   r   Úis_within_2gb˜   s   
zHIPBackend.is_within_2gbc                 C   s$   t  | ¡}d| v r|ddgg7 }|S )NÚSztt.pointer_ranger?   )r   Ú
parse_attr)ZdescÚretr   r   r   rŽ   £   s   
zHIPBackend.parse_attrc                 K   s:   t j| |fi |¤Ž}t ¡ r|dkrt | ¡r|d7 }|S )NZtensorr   )r   Úget_arg_specializationrc   r…   rŒ   )r‹   ÚtyÚkwargsr   r   r   r   r   ª   s   z!HIPBackend.get_arg_specializationc                  C   sp   t  d¡} | d urt| ƒ}| ¡ r|S ttƒjd }| ¡ r |S tdƒ}| ¡ r*|S tdƒ}| ¡ r4|S tdƒ‚)NZTRITON_HIP_LLD_PATHzllvm/bin/ld.lldz/opt/rocm/llvm/bin/ld.lldz/usr/bin/ld.lldzWROCm linker /opt/rocm/llvm/bin/ld.lld not found. Set 'TRITON_HIP_LLD_PATH' to its path.)r   r   r   Úis_filerC   rD   Ú	Exception)Zlld_env_pathZlldr   r   r   Úpath_to_rocm_lld³   s   
zHIPBackend.path_to_rocm_lldc                 C   sŽ   t  | j¡}| ¡  tj |¡ tj |¡ tj 	|¡ tj 
|¡ tj |¡ tj |¡ tj |¡ tj |¡ tj |¡ | | ¡ | S rn   )r   Úpass_managerÚcontextÚenable_debugr   ÚcommonZadd_inlinerÚttirZadd_rewrite_tensor_pointerÚadd_canonicalizerZadd_combineZadd_reorder_broadcastÚadd_cseZadd_licmÚadd_symbol_dceZadd_loop_unrollÚrun)Úmodr{   r}   Úpmr   r   r   Ú	make_ttirÇ   s   
zHIPBackend.make_ttirc                 C   s&  t  | j¡}| ¡  tj |d|j› |j|j	|j
¡ | | ¡ t  | j¡}| ¡  tj |¡ tj |¡ tj |¡ tjj ||j|j|j¡ tj |¡ tjj |¡ tj |d¡ tjj |¡ tt dd¡ƒ}tt dd¡ƒ}|jdkr}d }}t |j¡r|jdksŒJ d	ƒ‚tjj ||j||¡ tj |¡ |j  ¡ d
kr­tjj !||j¡ tj |d¡ tj |¡ tj "|¡ t |j¡ràtjj #|¡ t$|jƒ}|rà|jdkràtjj %|¡ t& '¡ rútjj (|¡ tj |¡ tjj )||j¡ tj |¡ tj *|¡ tj +|¡ | | ¡ | S )Nzhip:TZTRITON_HIP_GLOBAL_PREFETCHr   ZTRITON_HIP_LOCAL_PREFETCHzlocal-prefetchr   r   zÕTriton AMD backend pipeliner has been updated. We used to trigger software pipelining with num_stages == 0. Now it will not happen anymore; please update to use num_stages == 2 for equivalent behavior in the past.r<   r#   ),r   r–   r—   r˜   r   rš   Zadd_convert_to_ttgpuirr   r!   rA   r%   rž   ÚttgpuirZadd_coalesceZadd_remove_layout_conversionsZadd_optimize_thread_localityr   Zadd_accelerate_matmulr6   r7   Zadd_optimize_epilogueZadd_optimize_dot_operandsZadd_hoist_layout_conversionsr`   r   r   r=   Zhas_matrix_core_featurer$   Zadd_stream_pipeliner™   r›   ÚlowerZinsert_instruction_sched_hintsZadd_reduce_data_duplicationZadd_reorder_instructionsr   Zadd_block_pingpongrc   r…   Zadd_canonicalize_pointersZadd_convert_to_buffer_opsrœ   r   )rŸ   r{   r}   r    Zglobal_prefetchZlocal_prefetchZuse_block_pingpongr   r   r   Ú
make_ttgir×   sV   ÿ



zHIPBackend.make_ttgirc                    sH  | }t  |j¡}| ¡  tjj ||j¡ d}tjj 	||j|¡ tj
 |¡ tj
 |¡ tj |¡ d}tjj ||j|¡ tj |¡ tj |¡ tj
 |¡ tj
 |¡ tj |¡ tj |¡ tj |¡ |j ¡ dkr{tjj ||j|j¡ tj dd¡dkrŠtj |¡ tjj ||¡ | |¡ t  !¡  t  ¡ }t  "||¡‰ t #ˆ ¡ d}tj dd¡dkr·d	}t  $ˆ tj%|j|¡ t &ˆ |j¡ t 'ˆ d
¡ t (ˆ dd¡ t (ˆ dd¡ t (ˆ dd¡ t (ˆ d|j)dk¡ dd„ ˆ  *¡ D ƒ}	|	d  +tj,¡ |	d  -dd|j.|j) › ¡ |	d  -d|j/› ¡ |j0rdnd}
|	d  -d|
¡ tj dd¡dkr>|	d  1d	¡ |	d  2¡  t 3|	d ¡ tj dd¡dkrnt4t5ƒj6d }t7|d ƒt7|d ƒt7|d ƒg}t  8ˆ |¡ n|j9r‚‡ fdd„|j9D ƒ}t  8ˆ |¡ t  :ˆ t j;|jdg |j<¡ |  =d¡|d< t >ˆ ¡ t ?ˆ ¡ t7ˆ ƒS ) Nr   Tr<   ZTRITON_DISABLE_LINE_INFOr   Ú ÚTRITON_ENABLE_ASANr   ú+xnackiô  Z__oclc_finite_only_optFZ__oclc_correctly_rounded_sqrt32Z__oclc_unsafe_math_optZ__oclc_wavefrontsize64r@   c                 S   s   g | ]}|  ¡ s|‘qS r   )Zis_declaration)rP   Úfnr   r   r   rS   K  s    z(HIPBackend.make_llir.<locals>.<listcomp>zamdgpu-flat-work-group-sizez1,zamdgpu-waves-per-euzpreserve-signr1   zdenormal-fp-math-f32r>   z
asanrtl.bczocml.bczockl.bcc                    s    g | ]\}}t  ˆ |¡r|‘qS r   )r   Zneed_extern_lib)rP   rQ   Úpath©Zllvm_modr   r   rS   k  s     z
ttg.sharedrz   )@r   r–   r—   r˜   r   r   r¢   Z%add_decompose_unsupported_conversionsr   Zadd_optimize_lds_usageÚconvertZadd_scf_to_cfZadd_index_to_llvmirZadd_allocate_shared_memoryZadd_to_llvmirr™   r›   rœ   Zadd_cf_to_llvmirZadd_arith_to_llvmirr   r=   r£   Zlower_instruction_sched_hintsr$   r   rƒ   r„   ZllvmirZadd_di_scopeZadd_builtin_func_to_llvmirrž   r   Zinit_targetsZ	to_moduleZattach_target_tripleZattach_datalayoutÚTARGET_TRIPLEZset_isa_versionZset_abi_versionZset_bool_control_constantrA   Zget_functionsZset_calling_convZCALLING_CONV_AMDGPU_KERNELZadd_fn_attrr!   r"   r8   Zadd_fn_target_featureZadd_fn_asan_attrZset_all_fn_arg_inregr   rC   rD   rH   Zlink_extern_libsr*   Zoptimize_moduleZOPTIMIZE_O3r4   Zget_int_attrZcleanup_bitcode_metadataZdisable_print_inline)Úsrcr{   r}   rŸ   r    Zcustom_lds_sizeZ_HIPBackend__HIP_FTZr—   Útarget_featuresÚfnsZdenormal_moderL   Úpathsr   rª   r   Ú	make_llir  s~   




ý

zHIPBackend.make_llirc              	   C   sj   t  d| ¡}t|ƒdksJ ‚|d |d< t | tj|jdg |jd¡}t	j
 dd¡d	kr3td
ƒ t|ƒ |S )Nz3define amdgpu_kernel void @([a-zA-Z_][a-zA-Z0-9_]*)r   r   rQ   r¥   FZAMDGCN_ENABLE_DUMPr   r   z!// -----// AMDGCN Dump //----- //)ÚreÚfindallÚlenr   Ztranslate_to_asmr   r¬   r   r4   r   rƒ   r„   Úprint)r­   r{   r}   ÚnamesÚamdgcnr   r   r   Úmake_amdgcny  s   zHIPBackend.make_amdgcnc                 C   s  d}t j dd¡dkrd}t | |j|¡}t ¡ }t 	¡ h}t 	¡ 1}t
|jdƒ}| |¡ W d   ƒ n1 s9w   Y  t |ddd	|jd
|jg¡ W d   ƒ n1 sVw   Y  t
|jdƒ}	|	 ¡ }
W d   ƒ n1 spw   Y  W d   ƒ |
S W d   ƒ |
S 1 sˆw   Y  |
S )Nr¥   r¦   r   r   r§   Úwbz-flavorZgnuz-sharedz-oÚrb)r   rƒ   r„   r   Zassemble_amdgcnr   rc   r•   ÚtempfileÚNamedTemporaryFileÚopenrQ   ÚwriteÚ
subprocessÚ
check_callÚread)r­   r{   r}   r®   rg   Z	rocm_pathZtmp_outZtmp_inZfd_inZfd_outr   r   r   r   Ú
make_hsacoˆ  s,   

ÿý
ÿ
ûþ
ûùzHIPBackend.make_hsacoc                    s^   ‡ ‡fdd„|d< ‡ ‡fdd„|d< ‡ ‡fdd„|d< ‡ ‡fdd„|d	< ‡ ‡fd
d„|d< d S )Nc                    ó   ˆ  | |ˆ ¡S rn   )r¡   ©r­   r{   ©r}   rK   r   r   r   š  ó    z'HIPBackend.add_stages.<locals>.<lambda>rš   c                    rÃ   rn   )r¤   rÄ   rÅ   r   r   r   ›  rÆ   Zttgirc                    rÃ   rn   )r±   rÄ   rÅ   r   r   r   œ  rÆ   Zllirc                    rÃ   rn   )r¸   rÄ   rÅ   r   r   r     rÆ   r·   c                    rÃ   rn   )rÂ   rÄ   rÅ   r   r   r   ž  rÆ   rg   r   )rK   Zstagesr}   r   rÅ   r   Ú
add_stages™  s
   zHIPBackend.add_stagesc                 C   s&   t jt ¡ dgdd}|› d| j› S )Nz	--versionrT   )ÚencodingrO   )r¿   Úcheck_outputrc   r•   r   )rK   Úversionr   r   r   r\      s   zHIPBackend.hash)r]   r^   r_   Ústaticmethodr   re   ri   r	   ry   r|   r~   r
   rH   r   r€   r   Ú	functoolsÚ	lru_cacher…   rŒ   rŽ   r   r•   r¡   r¤   r±   r¸   rÂ   rÇ   r\   Ú__classcell__r   r   rk   r   rc   \   sB    







5
k

rc   )Ztriton.backends.compilerr   r   Ztriton._C.libtritonr   r   r   r   Údataclassesr   Útypingr	   r
   r   Útypesr   rW   r»   r   r²   r¿   rÌ   Úpathlibr   r   r   r   rc   r   r   r   r   Ú<module>   s"    B