a
    khz                    @  s   d dl mZ d dlZd dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlmZ ddlmZ ddlmZ ed	Zed
ZG dd deZG dd de	e ZdS )    )annotationsN)ListOptionalSequenceTupleTypeVarGenericType)driver   )ir   )coreTTensorTyc                      s   e Zd Z fddZ  ZS )IncompatibleTypeErrorImplc                   s@   || _ || _d| j   d | j  | _tt| | j d S )Nzinvalid operands of type  and )type_atype_b__repr__messagesuperr   __init__)selfr   r   	__class__ F/var/www/auris/lib/python3.9/site-packages/triton/language/semantic.pyr      s    z"IncompatibleTypeErrorImpl.__init__)__name__
__module____qualname__r   __classcell__r   r   r   r   r      s   r   c                   @  sj  e Zd ZU ejZded< eZded< dd Zddd	d
dZddd	ddZ	ddddddZ
dddddddddZd;ddddZdddddddZd<ddd d!d"d#Zddd$d%d&d'Zddddd(d)d*Zddddd(d+d,Zddddd(d-d.Zdddd/d0d1Zdddd/d2d3Zddddd4d5d6Zdddd/d7d8Zddd9d:d;d<Zddd9d:d=d>Zdddd9d?d@dAZddd d/dBdCZdddd/dDdEZdddd/dFdGZdddd/dHdIZdddd/dJdKZdddd/dLdMZddNdOdPZ dddd/dQdRZ!dddd/dSdTZ"dddd/dUdVZ#dddWdXdYZ$dddWdZd[Z%dddWd\d]Z&dd^d_d`daZ'dddd/dbdcZ(dddd/dddeZ)dddd/dfdgZ*dddd/dhdiZ+dddd/djdkZ,dddd/dldmZ-dndoddd^ddpdqdrZ.dddsdtduZ/dddsdvdwZ0dxdddydzd{Z1ddxdd|d}d~Z2ddxdddddZ3ddddddZ4dddddddZ5ddddddZ6dd dddZ7ddddddZ8ddddddZ9dddd!ddZ:ddddZ;ddddddZ<d=dddddddZ=dd Z>dd Z?dd Z@dd ZAdd ZBdd ZCdd ZDdd ZEdd ZFdddddddddd	ddZGdddddddZHddddddZIddddddZJddddddÄZKddń ZLddǄ ZMddddddɄZNdddddd˄ZOdddddd̈́ZPddddddτZQddddddфZRddddҜddԄZSddd՜ddׄZTddل ZUddۄ ZVdddddddܜddބZWdddddddߜddZXddddddddZYdddddZZdddddddddZ[dddddddddZ\dddddddddZ]dddddddddZ^dddddddddZ_dddddddddZ`dddddddddZadd ZbddddddddddZcddddZdddd ddZedddddddddddddddZfddddddd	Zgd
d ZhddddddZidddddddZjdddddddZkdddddddZlddxddddZmddxddddZnddxdddd Zodd!d"d#Zpdd$ddd%d&d'Zqdddd(d)d*Zrdd!d+d,Zsd-d. Ztd>d/d0Zuddd1d2d3Zvddd1d4d5Zwdd$d$d6d7d8d9d:ZxdnS (?  TritonSemanticzType[TensorTy]tensorz
ir.builderbuilderc                 C  s
   || _ d S N)r$   )r   r$   r   r   r   r      s    zTritonSemantic.__init__intr   )axisreturnc                 C  s,   |dvrt d| | | j|tjS )Nr   r   r   z+program_id axis must be 0, 1, or 2 but got )
ValueErrorr#   r$   Zcreate_get_program_idtlint32r   r'   r   r   r   
program_id&   s    zTritonSemantic.program_idc                 C  s,   |dvrt d| | | j|tjS )Nr)   z-num_programs axis must be 0, 1, or 2 but got )r*   r#   r$   Zcreate_get_num_programsr+   r,   r-   r   r   r   num_programs+   s    zTritonSemantic.num_programsztl.dtype)a_tyb_tyr(   c                 C  s   |j }|j }|j}|j}||kr0||kr,|S |S |tjjjkrN||krJ|S |S |tjjjkrl||krh|S |S td| d| d S )Nzunexpected signedness r   )int_bitwidthint_signednessr+   dtypeZ
SIGNEDNESSZUNSIGNED	TypeError)r   r0   r1   Za_rankZb_rankZa_snZb_snr   r   r   integer_promote_impl4   s    z#TritonSemantic.integer_promote_implbool)r0   a_is_scalarr1   b_is_scalar
div_or_modr(   c                 C  s^  ||krR|r||fn||f\}}|  j|  jkrR|rN|tjtjfv rNtjS |S | sb| rhtjS | sx| r~tjS |	 s|	 r|rtjS tjS |
 r|
 r|rtjS tjS |
 s|
 rtjS | r| r||kr|S tjS | r
| std| d| |rR|j|jkrRtd|  d |  d | ||S )Nunexpected type r   zCannot use /, #, or % with x because they have different signedness;this is unlikely to result in a useful answer. Cast them to the same signedness.)kindvaluer+   float16bfloat16float32Zis_fp64float64is_fp32is_fp16is_bf16is_fp8is_intr5   r3   r   r6   )r   r0   r8   r1   r9   r:   	scalar_tyZ	tensor_tyr   r   r   computation_type_implC   s:    z$TritonSemantic.computation_type_implT)
check_typec                 C  s  t |tr | | j|tjS t |trd|  kr>dk rJn ntj}npd|  kr^dk rjn ntj	}nPd|  kr~dk rn ntj
}n0d|  krdk rn ntj}ntd| d| j||d	S t |trJd
}ddd  }td |}|tdks.|dks.||ks.||  kr*|kr6n ntj}ntj}| j||d	S t |tjrd| |jS t || jrv|S |rtd| dt| d|S )N           l                             l            zNonrepresentable integer .r4   g      8g   ?r      absinfg        zcannot convert z	 of type z
 to tensor)
isinstancer7   r#   r$   get_int1r+   int1r&   r,   uint32int64uint64r*   scalar_constantfloat__builtins__rA   rB   	constexpr	to_tensorr>   r5   type)r   xrJ   r4   Zmin_float32Zmax_float32Zabs_xr   r   r   r^   u   sH    


zTritonSemantic.to_tensorNone)r   r   allow_ptr_ar(   c                 C  sF   |  rB|st|||  r0||kr0t||| rBt||d S r%   )is_ptrr   is_floating)r   r   r   rb   r   r   r   check_ptr_type_impl   s    

z"TritonSemantic.check_ptr_type_implFzTensorTy | numbers.NumberzTuple[TensorTy, TensorTy])lhsrhsr(   c                 C  s  t |tj}t |tj}|r*|}	| |}|r<|}
| |}|jj}|jj}| ||| | ||| |r| s| s| |||||}|r|	dk r|	 s|r|
dk r|	 rt
d| rD|r| |	  kr| ksn t
d|	 d| |rD| |
  kr.| ksDn t
d|
 d| |rX| j|	|dn
| ||}|rx| j|
|dn
| ||}| ||\}}||fS )Nr   z{Cannot perform a binary operation between an unsigned tensor and a negative scalar. Perform a explicit cast on one of them.zScalar z is out of range for type rP   )rT   numbersNumberr^   r_   scalarre   rc   rI   is_int_unsignedr*   rG   get_int_min_valueget_int_max_valuerZ   castbroadcast_impl_value)r   rf   rg   Zallow_lhs_ptrZallow_rhs_ptrZarithmetic_checkr:   Zlhs_is_scalarZrhs_is_scalarZ
lhs_scalarZ
rhs_scalar
lhs_sca_ty
rhs_sca_ty
ret_sca_tyr   r   r   binary_op_type_checking_impl   sF    




  z+TritonSemantic.binary_op_type_checking_implcallable)rf   rg   	binary_opc                 C  s   |j jjdks| jjjsd S |j j}|j j}||ks8J | sDJ | |tj	}| |tj	}|||d}|
 }| |tj	}| }| |tj	}| | ||| ||}	d|j d|j }
| |	|
 d S )N@   Fr&   z! overflow detected for operation )r_   rj   r2   r$   optionssanitize_overflowrG   rn   r+   rX   rm   rZ   rl   and_
less_equalgreater_equalr   device_assert)r   rf   rg   ru   rp   rq   retZ	max_valueZ	min_valuecondmsgr   r   r    binary_op_sanitize_overflow_impl   s     z/TritonSemantic.binary_op_sanitize_overflow_impl)inputotherrx   r(   c                 C  s>  |  ||dd\}}|jj}|jj}| r<| r<td| rf| sf|| }}|jj}|jj}| r|j}|j r|jjdk r|j	t
j| j}| j|j|d}| | j|j||jS | r| | j|j|j|jS | r,|r| ||| j | | j|j|j|jS td| d S )NTzcannot add pointers togetherrv   Fr;   )rs   r_   rj   rc   r5   handler4   rk   r2   with_element_tyr+   rX   to_irr$   create_int_castr#   Zcreate_addptrrd   Zcreate_faddrG   r   addZ
create_add)r   r   r   rx   input_scalar_tyother_scalar_tyZother_handleZi64_tyr   r   r   r      s,    

zTritonSemantic.addc                 C  s   |  ||dd\}}|jj}| r:| j|| |ddS | r^| | j	|j
|j
|jS | r|rz| ||| j | | j|j
|j
|jS td| d S )NTF)rx   r;   )rs   r_   rj   rc   r   minusrd   r#   r$   Zcreate_fsubr   rG   r   subZ
create_subr5   r   r   r   rx   rH   r   r   r   r      s    zTritonSemantic.subc                 C  s   |  ||\}}|jj}| r<| | j|j|j|jS | rt|rX| 	||| j
 | | j|j|j|jS td| d S Nr;   )rs   r_   rj   rd   r#   r$   Zcreate_fmulr   rG   r   mulZ
create_mulr5   r   r   r   r   r     s    zTritonSemantic.mul)r   r   r(   c                 C  s   |  ||dddd\}}|jj}|jj}| rF| rF| ||}n| rd| rd| ||}nt| r| r| |tj}| |tj}nF| r| r|j|jkr| ||}q| ||}nt	d| | 
| j|j|j|jS NFTr;   )rs   r_   rj   rd   rG   rn   r+   rA   Zfp_mantissa_widthr5   r#   r$   create_fdivr   )r   r   r   r   r   r   r   r   truediv  s     zTritonSemantic.truedivc                 C  s   |  ||dddd\}}|jj}|jj}| r| r| ||}| ||}| ||}| r| | j	|j
|j
|jS | | j|j
|j
|jS td| d S r   )rs   r_   rj   rG   r6   rn   is_int_signedr#   r$   Zcreate_sdivr   Zcreate_udivr5   )r   r   r   r   r   ret_tyr   r   r   floordiv7  s    zTritonSemantic.floordiv)r   r   ieee_roundingr(   c                 C  s`   |j j}|j j}| r | s(td| ||dddd\}}| j|j|j}| ||j S )Nz4both operands of fdiv must have floating scalar typeFT)	r_   rj   rd   r5   rs   r$   r   r   r#   )r   r   r   r   r   r   r}   r   r   r   fdivE  s    zTritonSemantic.fdivc                 C  s   |  ||dddd\}}|jj}|jj}| rL| | j|j|j|jS | r|j	|j	krt
d|  d |  d | r| | j|j|j|jS | | j|j|j|jS t
d| d S )NFTzCannot mod z by r<   r;   )rs   r_   rj   rd   r#   r$   Zcreate_fremr   rG   r3   r5   r   r   Zcreate_sremZcreate_urem)r   r   r   rH   r   r   r   r   modN  s     zTritonSemantic.modztl.PropagateNan)r`   ypropagate_nanc                 C  s   |  ||\}}|j}| r~|tjjkrF| | j|j	|j	|j
S |tjjkrn| | j|j	|j	|j
S td| nV| r| | j|j	|j	|j
S | r| | j|j	|j	|j
S td| d S NzUnexpected propagate_nan Unexpected dtype )rs   r4   rd   r+   PropagateNanALLr#   r$   Zcreate_minimumfr   r_   NONEZcreate_minnumfr*   r   Zcreate_minsirk   Zcreate_minuir5   r   r`   r   r   r4   r   r   r   minimume  s    zTritonSemantic.minimumc                 C  s   |  ||\}}|j}| r~|tjjkrF| | j|j	|j	|j
S |tjjkrn| | j|j	|j	|j
S td| nV| r| | j|j	|j	|j
S | r| | j|j	|j	|j
S td| d S r   )rs   r4   rd   r+   r   r   r#   r$   Zcreate_maximumfr   r_   r   Zcreate_maxnumfr*   r   Zcreate_maxsirk   Zcreate_maxuir5   r   r   r   r   maximumv  s    zTritonSemantic.maximum)r`   minmaxr   c                 C  st   |  ||\}}|  ||\}}|  ||\}}|j}| r`| | j|j|j|j||jS td| dd S )Nr   z(. Only floating point clamp is supported)	rs   r4   rd   r#   r$   Zcreate_clampfr   r_   r5   )r   r`   r   r   r   r4   r   r   r   clamp  s    "zTritonSemantic.clampc                 C  sv   |  ||\}}|jj}|jj}| r0| s:t||| ||}||krZ| ||}||krn| ||}||fS r%   )rs   r_   rj   rG   r   r6   rn   )r   r   r   input_sca_tyZother_sca_tyrr   r   r   r   bitwise_op_type_checking_impl  s    
z,TritonSemantic.bitwise_op_type_checking_implc                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Z
create_andr   r_   r   r   r   r   r   r   ry     s    zTritonSemantic.and_c                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Z	create_orr   r_   r   r   r   r   or_  s    zTritonSemantic.or_c                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Z
create_xorr   r_   r   r   r   r   xor_  s    zTritonSemantic.xor_c                 C  s<   |j  s| |tj}|j  s0| |tj}| ||S r%   )r_   is_int1bitcastr+   rV   ry   r   r   r   r   logical_and  s
    

zTritonSemantic.logical_andc                 C  s<   |j  s| |tj}|j  s0| |tj}| ||S r%   )r_   r   r   r+   rV   r   r   r   r   r   
logical_or  s
    

zTritonSemantic.logical_orr   c                 C  s"   |j  s| |tj}| |S r%   )r_   r   r   r+   rV   invertr   r   r   r   r   not_  s    
zTritonSemantic.not_c                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Zcreate_lshrr   r_   r   r   r   r   lshr  s    zTritonSemantic.lshrc                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Zcreate_ashrr   r_   r   r   r   r   ashr  s    zTritonSemantic.ashrc                 C  s,   |  ||\}}| | j|j|j|jS r%   )r   r#   r$   Z
create_shlr   r_   r   r   r   r   shl  s    zTritonSemantic.shl)r   r(   c                 C  s   |S r%   r   r   r   r   r   plus  s    zTritonSemantic.plusc                 C  sN   |j j}| r$td|  d | | j|| j|}| 	||dS )Nz$wrong type argument to unary minus ()T)
r_   rj   rc   r*   r   r#   r$   get_null_valuer   r   )r   r   r   _0r   r   r   r     s
    zTritonSemantic.minusc                 C  sT   |j j}| s| r,td|  d | | j|	| j|}| 
||S )Nz%wrong type argument to unary invert (r   )r_   rj   rc   rd   r*   r   r#   r$   Zget_all_ones_valuer   r   )r   r   r   Z_1r   r   r   r     s
    zTritonSemantic.invertztl.block_type)vr(   c                 C  s   |j tjS r%   )r_   r   r+   rV   )r   r   r   r   r   
_bool_like  s    zTritonSemantic._bool_likec                 C  s   |  ||\}}|jj}| r@| | j|j|j| |S |	 r|
 rp| | j|j|j| |S | | j|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpOGTr   r   rG   r   Zcreate_icmpSGTZcreate_icmpUGTr5   r   r   r   rH   r   r   r   greater_than  s       zTritonSemantic.greater_thanc                 C  s   |  ||\}}|jj}| r@| | j|j|j| |S |	 r|
 rp| | j|j|j| |S | | j|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpOGEr   r   rG   r   Zcreate_icmpSGEZcreate_icmpUGEr5   r   r   r   r   r{     s       zTritonSemantic.greater_equalc                 C  s   |  ||\}}|jj}| r@| | j|j|j| |S |	 r|
 rp| | j|j|j| |S | | j|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpOLTr   r   rG   r   Zcreate_icmpSLTZcreate_icmpULTr5   r   r   r   r   	less_than  s       zTritonSemantic.less_thanc                 C  s   |  ||\}}|jj}| r@| | j|j|j| |S |	 r|
 rp| | j|j|j| |S | | j|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpOLEr   r   rG   r   Zcreate_icmpSLEZcreate_icmpULEr5   r   r   r   r   rz     s       zTritonSemantic.less_equalc                 C  sz   |  ||\}}|jj}| r@| | j|j|j| |S |	 rh| | j
|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpOEQr   r   rG   Zcreate_icmpEQr5   r   r   r   r   equal"  s      zTritonSemantic.equalc                 C  sz   |  ||\}}|jj}| r@| | j|j|j| |S |	 rh| | j
|j|j| |S td| d S r   )rs   r_   rj   rd   r#   r$   Zcreate_fcmpUNEr   r   rG   Zcreate_icmpNEr5   r   r   r   r   	not_equal-  s      zTritonSemantic.not_equalN)r   )startendr   r(   c          	      C  s   t |trt |tstdt|d? }t|d? }|s<|rDtd||krTtd|| }||d @ dkrttd|g}|d u rttj|}|| j}| 	| j
||||S )Nz/arange's arguments must be of type tl.constexpr    zarange must fit in int32z=arange's end argument must be greater than the start argumentr   r   z#arange's range must be a power of 2)rT   r&   r*   r7   r+   
block_typer,   r   r$   r#   Zcreate_make_range)	r   r   r   r   Zis_start_int64Zis_end_int64rangeshapeZ	ret_ty_irr   r   r   arange<  s     zTritonSemantic.arange)r4   r(   c                 C  sV   |d u rt d|dkr.| j|| j}nt| jd|j }||}| ||S )Nz2dtype must be specified when value is not a tensorr   get_)r*   r$   r   r   getattrnamer#   )r   r>   r4   Zget_value_fnr   r   r   rZ   N  s    zTritonSemantic.scalar_constantc                 C  s8   t |tjr,|jjdks J d| ||S | ||S )Nr   zonly accepts size-1 tensor)rT   r+   r#   numelr>   rn   rZ   )r   r>   r4   r   r   r   make_scalarY  s    zTritonSemantic.make_scalarz	List[int])r   r4   r(   c                 C  s   |  | |||S r%   )splatr   )r   r   r>   r4   r   r   r   full`  s    zTritonSemantic.full)r>   r   r(   c                 C  sP   |j  rJ dt|dkr"|S t|j|}| | j|	| j|j
|S )NzCannot splat a block tensorr   )r_   is_blocklenr+   r   r4   r#   r$   create_splatr   r   )r   r>   r   r   r   r   r   r   g  s
    zTritonSemantic.splat)r   	dst_shapecan_reorderr(   c                 C  sT   d}|D ]}||9 }q|j j|kr*tdt|j j|}| | j|j	|||S )Nr   z:reshape() cannot change total number of elements in tensor)
r_   r   r*   r+   r   rj   r#   r$   Zcreate_reshaper   )r   r   r   r   r   sr   r   r   r   reshapen  s    
zTritonSemantic.reshape)r   r'   r(   c                 C  s\   dd |j D }||d |j s4| j||dS t|jj|}| | j	
|j||S )Nc                 S  s   g | ]}t |qS r   r+   _unwrap_if_constexpr.0r`   r   r   r   
<listcomp>x      z.TritonSemantic.expand_dims.<locals>.<listcomp>r   r   )r   insertr_   r   r   r+   r   rj   r#   r$   create_expand_dimsr   )r   r   r'   r   r   r   r   r   expand_dimsw  s    
zTritonSemantic.expand_dims)rf   rg   r   r(   c                 C  sZ   |sJ dt |jdksJ t|jj|jd |jd  g}| | j|j	|j	|S )Nz;current implementation of `cat` always may reorder elementsr   r   )
r   r   r+   r   r_   rj   r#   r$   Z
create_catr   )r   rf   rg   r   ret_typer   r   r   cat  s    "zTritonSemantic.cat)abr(   c                 C  s   |  ||\}}|jg k}|r6| |d}| |d}t|jd tjrTtd}nd}|j|g }t|jj|}| 	| j
|j|j|}|r| j|dgdd}|S )Nr   r   Fr   )ro   r   r   rT   r+   r]   r   r_   rj   r#   r$   Zcreate_joinr   r   )r   r   r   Z
was_rank_1Ztwo	new_shaper   r}   r   r   r   join  s    
zTritonSemantic.join)r   r(   c                 C  sr   t |jdksJ t|jd dks*J |jd d }t|jj|}| j|j	\}}| 
||| 
||fS )Nr   r   r   )r   r   r+   r   r   r_   rj   r$   Zcreate_splitr   r#   )r   r   r   r   ZoutLHSZoutRHSr   r   r   split  s    

zTritonSemantic.splitz
Tuple[int])r   dimsr(   c                   s   t  jt |krtdtdd |D ttt |krJtd| t jj	 fdd|D }| 
| j j||S )Nz5permute dims must have the same length as input shapec                 s  s   | ]}t |V  qd S r%   r   r   dr   r   r   	<genexpr>  r   z)TritonSemantic.permute.<locals>.<genexpr>z?permute dims must be a permutation of 0, 1, ..., n-1, but were c                   s   g | ]} j | qS r   r   r   r   r   r   r     r   z*TritonSemantic.permute.<locals>.<listcomp>)r   r   r*   sortedlistr   r+   r   r_   rj   r#   r$   Zcreate_transr   )r   r   r   r   r   r   r   permute  s    "zTritonSemantic.permute)r   r   r(   c                 C  s   |j  s| ||S |j  }t|t|krDtd| d| ||krP|S t|D ]F\}}|| |krX|dkrXtd||  d| d| d| d| 
qXt|j j	|}| 
| j|j||S )Nz!Cannot broadcast, rank mismatch: z, r   z3Cannot broadcast, the expanded size of the tensor (z ) must match the existing size (z) at non-singleton dimension : )r_   r   r   get_block_shapesr   r*   	enumerater+   r   rj   r#   r$   create_broadcastr   )r   r   r   Z	src_shapeiitemr   r   r   r   broadcast_impl_shape  s(    


z#TritonSemantic.broadcast_impl_shapec              	   C  st  |j }|j }| rL| sL||j}| | j|| j|j|}n | s| r||j}| | j|| j|j|}n| rl| rl|	 }|	 }t
|t
|k rtt
|t
|D ]<}| | j|jdt|jdg|j }|j }|	 }qndt
|t
|k rxtt
|t
|D ]>}| | j|jdt|jdg|j }|j }|	 }q8t
|t
|ksJ g }t|D ]p\}	}
||	 }|
dkr|| nH|dks||
kr||
 n(tdt|	 d t|
 d t| q||kr<t|j|}| | j|j||}||krlt|j|}| | j|j||}||fS )Nr   r   z?Cannot make_shape_compatible: incompatible dimensions at index r   r   )r_   r   r   rj   r#   r$   r   r   r   r   r   r   r   r+   r   valuesr   appendr*   strr   )r   rf   rg   Zlhs_tyZrhs_tyZ	lhs_shapeZ	rhs_shape_	ret_shaper   leftrightr   r   r   r   ro     sf    $$



z#TritonSemantic.broadcast_impl_valuezOptional[str])rounding_modec                 C  s@   |d u rd S |dkrt jjS |dkr,t jjS td| dd S )NZrtneZrtzzInvalid rounding mode: z0. Supported rounding modes are 'rtne' and 'rtz'.)r   ROUNDING_MODERTNEZRTZr*   )r   r   r   r   r   _str_to_rounding_mode  s    z$TritonSemantic._str_to_rounding_mode)r   dst_tyr(   c                 C  s   |j }| r||j}||kr&|S |j}|j}| sB| rN| ||S |j}|j}||kr~tdt| d t| | 	| j
|j|| j
|S )Nz!Cannot bitcast data-type of size z to data-type of size )r_   r   r   rj   rc   rn   primitive_bitwidthr*   r   r#   r$   create_bitcastr   r   )r   r   r  src_ty
src_sca_ty
dst_sca_tyZsrc_bitsZdst_bitsr   r   r   r     s     zTritonSemantic.bitcast)r   r  fp_downcast_roundingr(   c                 C  s  |j }|j}|j}||kr|S | r0||}| |}d}| r~| r~|j|jk r~|d u rltjj	}q|tjj	krd}n$|d urt
dt| d t| | s| r| jjdd usJ d| jjd |||| dS | r| s| r
| s|r2| | j|j|| j||S | rF| rZ| rp| sp| | |tj|S | o| o|j|jk}|r| | j|j|| j|S | o| o|j|jk }	|	r| | j|j|| j|S | r| r|j|jks,|j|jkr|  o>|!  }
|! rz|j"| j}| | j#||j"}| $||S | | j%|j|| j|
|S |& r4| r4|! r|j"| j}| | j#||j"}| $||S |  r| | j'|j|| j|S | | j(|j|| j|S | r|& r|! s\|  s|| | j)|j|| j|S | | j*|j|| j|S |+ r| r|j}|dkr| | j,|j|| j|S |d	kr| $| |tj-| | j.d
tj-S | rF|+ rF| | j/|j|| j|S |+ rz|+ rz| | j0|j|| j|S dsJ d| d| d S )NFTz]fp_downcast_rounding should be set only for truncating fp conversions. Source scalar type is z and destination type is Zconvert_custom_typesz0target doesn't provide conversion for this type.)Z	_semanticrv   r   r   zcannot cast z to )1r_   rj   r   r   r   rd   r  r   r   r   r*   r   is_fp8e4b15r$   codegen_fnsgetrF   r#   Zcreate_fp_to_fpr   r   rD   rC   rE   rn   r+   rA   Zcreate_fp_truncZcreate_fp_extrG   r2   r3   r   is_boolr4   r   r   r   Zis_standard_floatingZcreate_fp_to_siZcreate_fp_to_uiZcreate_ui_to_fpZcreate_si_to_fprc   Zcreate_ptr_to_intrX   	get_int64Zcreate_int_to_ptrr  )r   r   r  r  r  r  r  Zuse_custom_roundingZtruncate_fpZext_fpZsign_extendtyr   bitwidthr   r   r   rn     s    




 

 




    
 
(  zTritonSemantic.castc                 C  sV   t jj}|rR|dkrt jj}n4|dkr0t jj}n"|dkrBt jj}ntd| d|S )Nz.ca.cgz.cvCache modifier  not supported)r   CACHE_MODIFIERr   CACGZCVr*   r   cache_modifiercacher   r   r   _str_to_load_cache_modifier  s    


z*TritonSemantic._str_to_load_cache_modifierc                 C  sh   t jj}|rd|dkrt jj}nF|dkr0t jj}n4|dkrBt jj}n"|dkrTt jj}ntd| d|S )Nz.wbr  z.csz.wtr  r  )r   r  r   ZWBr  CSZWTr*   r  r   r   r   _str_to_store_cache_modifier  s    



z+TritonSemantic._str_to_store_cache_modifierc                 C  sD   t jj}|r@|dkrt jj}n"|dkr0t jj}ntd| d|S )NZ
evict_lastZevict_firstzEviction policy r  )r   ZEVICTION_POLICYNORMALZ
EVICT_LASTZEVICT_FIRSTr*   )r   eviction_policyevictionr   r   r   _str_to_eviction_policy  s    

z&TritonSemantic._str_to_eviction_policyc                 C  s@   d }|r<|dkrt jj}n"|dkr,t jj}ntd| d|S )NZzeronanzPadding option r  )r   PADDING_OPTIONZPAD_ZEROPAD_NANr*   )r   padding_optionpaddingr   r   r   _str_to_padding_option  s    

z%TritonSemantic._str_to_padding_optionc                 C  sh   t jj}|rd|dkrt jj}nF|dkr0t jj}n4|dkrBt jj}n"|dkrTt jj}ntd| d|S )NacquirereleaseZacq_relZrelaxedMemory semantic r  )r   ZMEM_SEMANTICZACQUIRE_RELEASEZACQUIREZRELEASEZRELAXEDr*   )r   Z
sem_optionsemr   r   r   _str_to_sem  s    



zTritonSemantic._str_to_semc                 C  sV   t jj}|rR|dkrt jj}n4|dkr0t jj}n"|dkrBt jj}ntd| d|S )NZgpuZctasysr'  r  )r   ZMEM_SYNC_SCOPEZGPUZCTAZSYSTEMr*   )r   Zscope_optionscoper   r   r   _str_to_scope  s    


zTritonSemantic._str_to_scopec                 C  s   |rt |ds|g}dd |D }|D ],}t|trNd|  krLt|k s&n J q&t|dksdJ t|tt|ksJ dt|S dS )N__iter__c                 S  s"   g | ]}t |tjr|jn|qS r   rT   r+   r]   r>   r   elemr   r   r   r     r   z?TritonSemantic._canonicalize_boundary_check.<locals>.<listcomp>r   z'Duplicate dimension in `boundary_check`r   )hasattrrT   r&   r   setr   )r   boundary_checkblock_shapeZdimr   r   r   _canonicalize_boundary_check  s    
*z+TritonSemantic._canonicalize_boundary_checkc	              
   C  s   |d us|d urt d|jjj}	|	tjks4J d|	 rP|tjjkrPt d|jj}
| 	||

 }| | j|j||||||
S )NK`mask` and `other` arguments cannot be specified for loading block pointers4`tl.int1` should be rewritten in `tl.make_block_ptr`z@Padding option `nan` is not supported for integer block pointers)r*   r_   
element_tyr+   rV   rG   r   r   r!  r5  r   r#   r$   Zcreate_tensor_pointer_loadr   )r   ptrmaskr   r3  r#  r  r  is_volatileelt_tyr  r   r   r   _load_block_pointer  s    
z"TritonSemantic._load_block_pointerc	              
   C  s  |j j s"td|j   d|d u r:|d ur:td|sB|rJtd|j  s|rj|j  rjtd|r|j  rtd|j  r|d ur| ||j  }|d ur| ||j  }|j j}	|	j}
|
t	j
k}|rt	j}
t	|
|	j}	| ||	}|d ur| ||
}|j  r*|j |
}n|
}|d u rV| | j|j||||}n.| | j|j|j|rt|jnd ||||}|r| |t	j
}|S )NUnsupported ptr type z in `tl.load`z)`other` cannot be provided without `mask`z`padding_option` or `boundary_check` argument is not supported for loading a tensor ofpointers or loading a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadEMask argument cannot be block type if pointer argument is not a blockzFOther argument cannot be block type if pointer argument is not a block)r_   rj   rc   r*   r   r   r   r   r8  r+   rV   int8pointer_typeaddress_spacern   r   r#   r$   Zcreate_loadr   Zcreate_masked_load)r   r9  r:  r   r3  r#  r  r  r;  ptr_tyr<  r  r  r}   r   r   r   _load_legacy  sN    




zTritonSemantic._load_legacyzOptional[TensorTy]r   r   )	r9  r:  r   r3  r"  r  r  r;  r(   c	              
   C  sh   |  |}	| |}
| |}|j rL|jj rL| ||||||	|
|S | ||||||	|
|S d S r%   )	r  r  r$  r_   rc   r8  r   r=  rD  )r   r9  r:  r   r3  r"  r  r  r;  r  r  r#  r   r   r   load2  s    


zTritonSemantic.loadztl.tensor_descriptor_base)descr  r  r(   c                 C  sz   t |tjsJ t|j}t||ks>J d| dt| | j|dd}| j|j|| 	|| 
|}| ||jS )N	expected  offsets, but got Frequire_i64)rT   r+   tensor_descriptor_baser   r4  _convert_to_ir_valuesr$   Zcreate_descriptor_loadr   r  r  r#   r   )r   rF  offsetsr  r  ndimr`   r   r   r   descriptor_load@  s    
$zTritonSemantic.descriptor_load)rF  r>   r(   c                 C  sR   t |tjsJ t|j}t||ks>J d| dt| |j|jksNJ d S )NrG  rH  )rT   r+   rK  r   r4  r   )r   rF  r>   rM  rN  r   r   r   validate_store_likeK  s    
$z"TritonSemantic.validate_store_likec                 C  s:   |  ||| | j|dd}| | j|j|j|tjS NFrI  )rP  rL  r#   r$   Zcreate_descriptor_storer   r+   void)r   rF  r>   rM  r   r   r   descriptor_storeQ  s    zTritonSemantic.descriptor_storec                 C  sn   |  ||| |jtjtjtjtjtjtjhv s8J d| j	|dd}t
jj}| | j||j|j|tjS NUnsupported dtypeFrI  )rP  r4   r+   rW   r,   rY   rA   r?   r@   rL  r   DESCRIPTOR_REDUCE_KINDADDr#   r$   create_descriptor_reducer   rR  r   rF  r>   rM  r=   r   r   r   descriptor_atomic_addV  s
    *z$TritonSemantic.descriptor_atomic_addc                 C  s   t j }|jdko|jdkS )NZcudaZ   )r
   activeZget_current_targetbackendarch)r   targetr   r   r   _has_native_tma]  s    
zTritonSemantic._has_native_tmac                 C  sL   |t jt jt jt jt jt jhv s(J d|t jt jhv rH|  sHJ dd S )NrU  z-16-bit float types require native tma support)r+   rW   r,   rY   rX   r?   r@   r`  )r   r4   r   r   r   $_descriptor_atomic_min_max_supporteda  s    (z3TritonSemantic._descriptor_atomic_min_max_supportedc                 C  sP   |  ||| | |j | j|dd}tjj}| | j	||j
|j
|tjS rQ  )rP  ra  r4   rL  r   rV  MINr#   r$   rX  r   r+   rR  rY  r   r   r   descriptor_atomic_minf  s
    z$TritonSemantic.descriptor_atomic_minc                 C  sP   |  ||| | |j | j|dd}tjj}| | j	||j
|j
|tjS rQ  )rP  ra  r4   rL  r   rV  MAXr#   r$   rX  r   r+   rR  rY  r   r   r   descriptor_atomic_maxm  s
    z$TritonSemantic.descriptor_atomic_maxc                 C  sf   |  ||| |jtjtjtjtjhv s0J d| j|dd}tj	j
}| | j||j|j|tjS rT  )rP  r4   r+   rW   r,   rY   rX   rL  r   rV  ANDr#   r$   rX  r   rR  rY  r   r   r   descriptor_atomic_andt  s
    "z$TritonSemantic.descriptor_atomic_andc                 C  sf   |  ||| |jtjtjtjtjhv s0J d| j|dd}tj	j
}| | j||j|j|tjS rT  )rP  r4   r+   rW   r,   rY   rX   rL  r   rV  ORr#   r$   rX  r   rR  rY  r   r   r   descriptor_atomic_or{  s
    "z#TritonSemantic.descriptor_atomic_orc                 C  sf   |  ||| |jtjtjtjtjhv s0J d| j|dd}tj	j
}| | j||j|j|tjS rT  )rP  r4   r+   rW   r,   rY   rX   rL  r   rV  XORr#   r$   rX  r   rR  rY  r   r   r   descriptor_atomic_xor  s
    "z$TritonSemantic.descriptor_atomic_xor)r  r  r(   c           
      C  sF  t |tjsJ |dks J d|dks0J dt|jdksNJ d|j |jd dkslJ d|j t|jdksJ d	|j |jd d
ksJ d|j |j}d|j d
 }|jd |ksJ d| d| d|jd  t|j|jd |jd g}| j	|fddd }| j
|j|j||| j
}	| |	|S )N z#cache modifier is not supported yetz$eviction policy is not supported yetr   descriptor must be 2D, but got r   r   *descriptor block must have 1 row, but got x offsets must be 1D, but got    z5descriptor gather must have at least 8 rows, but got r   zdescriptor gather of  must have at least  columns, but got FrI  )rT   r+   rK  r   r4  r   r4   r  r   rL  r$   Zcreate_descriptor_gatherr   r   r#   )
r   rF  	x_offsetsy_offsetr  r  r4   min_colsr_   r`   r   r   r   descriptor_gather  s(    z TritonSemantic.descriptor_gather)r>   r(   c                 C  s  t |tjsJ t|jdks.J d|j |jd dksLJ d|j t|jdksjJ d|j |jd dksJ d|j |j}d	|j d }|jd |ksJ d
| d| d|jd  | j	|fddd }| j
|j|j|j| | d tjS )Nr   rm  r   r   rn  ro  rp  z6descriptor scatter must have at least 8 rows, but got r   zdescriptor scatter of rq  rr  FrI  )rT   r+   rK  r   r4  r   Zshapaer4   r  rL  r$   Zcreate_descriptor_scatterr   r#   rR  )r   rF  r>   rs  rt  r4   ru  r   r   r   descriptor_scatter  s"    z!TritonSemantic.descriptor_scatterc           	   	   C  s   |d urt d|jj }|j s2| ||}|j sDJ d||j ksnJ d| d|j  d|jjj|jjksJ d|jjj d|jj d|jjj}|tjksJ d| ||}| 	||}| 
| j|j|j|||tjS )	Nr6  z-Value argument must be block type or a scalarzBlock shape(z) and value shape(z
) mismatchzBlock element type(z) and value element type(r7  )r*   r_   r8  r   r   r   r+   rV   r5  rn   r#   r$   Zcreate_tensor_pointer_storer   rR  )	r   r9  valr:  r3  r  r  r4  r<  r   r   r   _store_block_pointer  s"    
2
z#TritonSemantic._store_block_pointerc           	   	   C  s>  |j j s"td|j   d|r.td|j  s`|j  rJtd|r`|j  r`td|j  r| ||j  }|d ur| ||j  }|j j}|j}|t	j
krt	j}t	||j}| ||}| ||}|d u r| | j|j|j||t	jS |j j std| | j|j|j|j||t	jS )Nr>  z in `tl.store`z`boundary_check` argument is not supported for storing a tensor of pointers or storing a scalar. Because the compiler does not know the boundary; please use block pointers (defined by `make_block_ptr`) insteadzFValue argument cannot be block type if pointer argument is not a blockr?  "Mask must have boolean scalar type)r_   rj   rc   r*   r   r   r   r   r8  r+   rV   r@  rA  rB  rn   r#   r$   Zcreate_storer   rR  r  Zcreate_masked_store)	r   r9  rx  r:  r3  r  r  rC  r<  r   r   r   _store_legacy  s6    




 zTritonSemantic._store_legacy)r9  rx  r:  r  r  r(   c           	      C  st   |  |}| |}|j s*|jj r2td|j r\|jj r\| 	||||||S | 
||||||S d S )N"Cannot store to a constant pointer)r  r  r_   is_constrj   r*   rc   r8  r   ry  r{  )	r   r9  rx  r:  r3  r  r  r  r  r   r   r   store  s    

zTritonSemantic.store)r9  cmprx  r(  r+  r(   c              	   C  sT   |  |}| |}|jjj}|jdvr0td| | j	|j
|j
|j
|||jS )N)   r   rv   z9atomic_cas only supports elements with width {16, 32, 64})r)  r,  r_   rj   r8  r  r*   r#   r$   Zcreate_atomic_casr   )r   r9  r  rx  r(  r+  r8  r   r   r   
atomic_cas  s    



zTritonSemantic.atomic_casz#Tuple[TensorTy, TensorTy, TensorTy])r9  rx  r:  opr(   c                 C  sn  |j j std|j   |j  s4|j j r<td|j jj}|tju rh|dkrhtd| d |tj	u r|dkrtd| d |tj
tjfv s|jdk rtd| d t| |j  r|d ur| ||j  }|d ur| ||j  }| ||j jj}|d u rd| jd	}tj}|j  rX|j tj}| j|| j|}| ||}|||fS )
Nz)Pointer argument of store instruction is r|  r   Zatomic_z does not support fp16z does not support bf16r  z does not support T)r_   rj   rc   r*   r   r}  r8  r+   r?   r@   int16uint16r  r   r   r   r   rn   r$   rU   rV   r   r   r   r#   )r   r9  rx  r:  r  r8  Zmask_irZmask_tyr   r   r   atom_red_typechecking_impl  s2    


z)TritonSemantic.atom_red_typechecking_impl)r`   r(   c                 C  s@   |j j}tj|dd}| ||}| ||d }| |tjS )NF)r  signedr   )r4   r  r+   Zget_int_dtyper   r   rn   rV   )r   r`   r  ZidtypeixZsignbitr   r   r   _signbit6  s
    zTritonSemantic._signbit)r9  rx  r:  r(  r+  r(   c                 C  s  |  |||d\}}}| |}| |}|jj}| r| rl| | j	t
jj|j|j|j|||jS | | j	t
jj|j|j|j|||jS |tjtjhvrtd| |tjkrtjntj}| ||}| |t|d}	|tjkrtjntj}
| ||
}| |t|
d}| |}| |}| | j	t
jj|	j|j| ||j|||j}| | j	t
jj|j|j| ||j|||j}| |||}| ||S )Nr   z#atomic_max not supported for dtype r   )r  r)  r,  r_   rj   rG   r   r#   r$   create_atomic_rmwr   	ATOMIC_OPrd  r   UMAXr+   rA   rB   r5   r,   rX   r   rA  rW   rY   r  r   ry   UMINwherer   r9  rx  r:  r(  r+  sca_tyZi_typeZi_valZi_ptrZui_typeZui_valZui_ptrnegposZpos_retZneg_retr}   r   r   r   
atomic_max=  sL    



zTritonSemantic.atomic_maxc                 C  s  |  |||d\}}}| |}| |}|jj}| r| rl| | j	t
jj|j|j|j|||jS | | j	t
jj|j|j|j|||jS |tjtjhvrtd| |tjkrtjntj}| ||}| |t|d}	|tjkrtjntj}
| ||
}| |t|
d}| |}| |}| | j	t
jj|	j|j| ||j|||j}| | j	t
jj|j|j| ||j|||j}| |||}| ||S )Nr   z#atomic_min not supported for dtype r   )r  r)  r,  r_   rj   rG   r   r#   r$   r  r   r  rb  r   r  r+   rA   rB   r5   r,   rX   r   rA  rW   rY   r  r   ry   r  r  r  r   r   r   
atomic_minc  sL    



zTritonSemantic.atomic_minc              
   C  sp   |  |||d\}}}| |}| |}|jj}| rBtjjntjj	}| 
| j||j|j|j|||jS )Nr   )r  r)  r,  r_   rj   rd   r   r  ZFADDrW  r#   r$   r  r   )r   r9  rx  r:  r(  r+  r  r  r   r   r   
atomic_add  s    

zTritonSemantic.atomic_addc              
   C  sT   |  |||d\}}}| |}| |}| | jtjj|j	|j	|j	|||j
S )Nand)r  r)  r,  r#   r$   r  r   r  rf  r   r_   r   r9  rx  r:  r(  r+  r   r   r   
atomic_and  s    

"zTritonSemantic.atomic_andc              
   C  sT   |  |||d\}}}| |}| |}| | jtjj|j	|j	|j	|||j
S )Nor)r  r)  r,  r#   r$   r  r   r  rh  r   r_   r  r   r   r   	atomic_or  s    

"zTritonSemantic.atomic_orc              
   C  sT   |  |||d\}}}| |}| |}| | jtjj|j	|j	|j	|||j
S )Nxor)r  r)  r,  r#   r$   r  r   r  rj  r   r_   r  r   r   r   
atomic_xor  s    

"zTritonSemantic.atomic_xorc              
   C  sT   |  |||d\}}}| |}| |}| | jtjj|j	|j	|j	|||j
S )NZxchg)r  r)  r,  r#   r$   r  r   r  ZXCHGr   r_   r  r   r   r   atomic_xchg  s    

zTritonSemantic.atomic_xchgc                 C  sL   |  | jjjv s,J d| jjj d| | }|dkr@d}ttj|S )Nzinput_precision must be one of . Got ZTF32X3ZTF32x3)lowerr$   rw   Zallowed_dot_input_precisionsupperr   r   ZINPUT_PRECISION)r   input_precisionr   r   r   _str_to_dot_input_precision  s    z*TritonSemantic._str_to_dot_input_precision)rf   rg   accr  max_num_imprecise_acc	out_dtyper(   c              
   C  s&  |j  r|j  sJ |j r.|j r.n|jtjtjtjtjtj	fv s\J d|j |jtjtjtjtjtj	fv sJ d|j |j|jksJ d|j d|j |j
 s|j
 rd| jjjv rtd | |tj}| |tj}|d u r
| jjj}| |}t|j}t|j}||  kr@dksvn ||  krZdksvn J d	|j d
|j d|jd j|jd jksJ d|j d|j d|jd j d|jd j d	| jjdd usJ d| jjd |j |j }	|jd j|	d kr:|jd j|	d kr:|jd j|	d ks`J d|	d  d|	d  d|	d  |j j r|j jtjksJ d| jd}
tj}nj| rtdnV|j j s|j j r| jd}
tj	}n&|  r| j!dn
| jd}
|}|j jd }|j jd }|j jd }|dkr>|j jd nd }t"||rX|||gn||g}|d u r| j#|$| j|
}n|j%}|j |ksJ |d u r|j r|j r| jjj&}nd}n8|j r|j r||krtd| d| d| '| j(|j%|j%||||S )NzUnsupported lhs dtype zUnsupported rhs dtype z&Both operands must be same dtype. Got r   Zfp8e4b15zthe use of fp8e4b15 is deprecated on Hopper and later architectures and can cause significant slow down. It will be removed in a future triton releaser      +Both inputs must be either 2D or 3D; (lhs: 	 vs rhs: r   r   zFirst input shape (z) and second input shape z= are not compatible for matmul (second index of first shape (z0) must be equal to first index of second shape (min_dot_sizez2target doesn't provide lower shape bounds for dot.r   r   zInput shapes should have M >= z, N >= z
 and K >= zonly int8 supported!zhout_dtype=bfloat16 is unsupported. Please use out_dtype=float32/float16 and cast with `.to(tl.bfloat16)`zmax_num_imprecise_acc (z) must be <= K ())r_   r   r4   rF   r+   r@  uint8r?   r@   rA   r  r$   rw   Z!deprecated_fp8_dot_operand_dtypeswarningswarnrn   Zdefault_dot_input_precisionr  r   r   r>   r	  r
  rj   rG   	get_int32r,   rE   r*   rC   get_fp32rD   Zget_fp16r   r   r   r   Zmax_num_imprecise_acc_defaultr#   Z
create_dot)r   rf   rg   r  r  r  r  lhs_rankrhs_rankr  r   Zret_scalar_tyMNKBr   
acc_handler   r   r   dot  s    

$




N0, 
" 

"zTritonSemantic.dot)float_formatc                 C  s.   t tj| d }|d u r*td| d|S )NzInvalid float format: rO   )r   r   ZScaleDotElemTypeTYr  r*   )r   r  Zty_enumr   r   r   _str_to_fp_type	  s    zTritonSemantic._str_to_fp_type)rx  r  c                 C  s   t jt jt jt jd|}|du rZ|dks:J d| |jt jksVJ d|j |S |j|krh|S t jt jt jt jd| }|j|ksJ d| d|j | 	||S dS )z
        If float_format is subbyte, make sure it's packed as uint8 and return it.
        Otherwise, return a tensor (perhaps bitcasting) of the specified float format.
        )e5m2e4m3bf16fp16Ne2m1z)Internal Error: Unexpected float format: z)e2m1 format must be packed as uint8. Got zUnexpected dtype for r  )
r+   Zfloat8e5Z
float8e4nvr@   r?   r
  r4   r  r  r   )r   rx  r  Z	triton_tyZunsigned_tyr   r   r   _bitcast_to_fp_type  s    
 z"TritonSemantic._bitcast_to_fp_typezTensorTy | None)rf   	lhs_scale
lhs_formatrg   	rhs_scale
rhs_formatr  	fast_math
lhs_k_pack
rhs_k_packr  r(   c           !      C  s  |j  r|j  sJ t|j}t|j}||  kr@dksrn ||  krVdksrn J d|j d|j d|j}|j}| |}| |}h d}||v sJ d| ||v sJ d| |d u pt|tjo|jd u }|d u pt|tjo|jd u }| 	||}| 	||}|	s6|d	ks6J d
|
sN|d	ksNJ d
|j jdd  \}}|j jdd  \}}|d	krdnd}|d	krdnd}|	r|| n|}|
r|| n|}||ksJ d|j d|j d|dkr|j jd nd }|	s
|| }|
s|| }t
||r.|||gn||g}| jd}|d u rf| j|| j|}n|j}|j |ks|J |rd n|j}|rd n|j} | | j|j| ||j||||	|
|
|S )Nr   r  r  r  r   >   r  r  r  r  r  zNYI: lhs_format zNYI: rhs_format r  zBonly mxfp4 inputs can be packed along a dimension different than Kr  r   zCReduction dimension should pack the same number of elements; (lhs: r   )r_   r   r   r   r>   r  rT   r+   r]   r  r   r$   r  r   r   r   r#   Zcreate_dot_scaled)!r   rf   r  r  rg   r  r  r  r  r  r  r  r  r  Zlhs_format_enumZrhs_format_enumallowed_formatsZrhs_scale_is_noneZlhs_scale_is_noner  ZK_LHSZK_RHSr  ZPACKED_AZPACKED_BZPACKED_A_DIMZPACKED_B_DIMr  r   r   r  Zrhs_scale_handleZlhs_scale_handler   r   r   
dot_scaled!  sV    

F

"$ 

zTritonSemantic.dot_scaled)	conditionr`   r   r(   c                 C  s   |j tjkrtd|j   | |tj}| ||dd\}}|j rl| 	||\}}| 	||\}}n| 	||\}}|j}| 
| j|j|j|j|S )Nzgtl.where with a non-boolean condition is deprecated and will error out in a future triton release. Got T)r4   r+   rV   r  r  rn   rs   r_   r   ro   r#   r$   Zcreate_selectr   )r   r  r`   r   r   r   r   r   r   r  U  s    

zTritonSemantic.wherec                 C  s"   |rt ||}n|}| ||S r%   )r+   r   r#   )r   r`   rH   r   Zres_tyr   r   r   wrap_tensori  s    zTritonSemantic.wrap_tensorzSequence[TensorTy]zTuple[TensorTy, ...])inputsr'   r(   c                   s    d u r"t fddD d d jjt} |k sNJ d| d fddtD tfddD sJ d	jd
d D  |  sJ t fddt	tD S )Nc                 3  s$   | ]} j ||jjgd dV  qdS )Tr   N)r   r   r>   r   tr   r   r   r   s  r   z+TritonSemantic.reduction.<locals>.<genexpr>r   z&reduction axis must be < inputs rank (r   c                   s   g | ]\}}| kr|qS r   r   )r   r   r   )r'   r   r   r   y  r   z,TritonSemantic.reduction.<locals>.<listcomp>c                 3  s   | ]}|j j kV  qd S r%   )r_   r   r  r   r   r   r   z  r   z-all reduction inputs must have the same shapec                 S  s   g | ]
}|j qS r   r   r  r   r   r   r   |  r   c                 3  s*   | ]"} | | jjV  qd S r%   r  Z
get_resultr_   rj   r   r   )r  	reduce_opr   r   r   r   r     s   )
tupler_   r   r   r   allr$   Zcreate_reduceverifyr   )r   r  r'   region_builder_fnrankr   )r'   r  r  r   r   r   r   	reductionq  s    
zTritonSemantic.reduction)r  r'   reverser(   c                   s    d j jt}| |  kr*|k sBn J d| d| d|dk rR||7 } D ]}|j jksVJ dqVjdd  D |||  sJ t fdd	tt D S )
Nr   z
scan axis z must be < inputs rank (r   z(all scan inputs must have the same shapec                 S  s   g | ]
}|j qS r   r  r  r   r   r   r     r   z3TritonSemantic.associative_scan.<locals>.<listcomp>c                 3  s*   | ]"} | | jjV  qd S r%   r  r  r  Zscan_opr   r   r   r   r     r   z2TritonSemantic.associative_scan.<locals>.<genexpr>)r_   r   r   r$   Zcreate_scanr  r  r   )r   r  r'   r  r  r  r  r   r  r   associative_scan  s    .zTritonSemantic.associative_scan)srcindexr'   r(   c                 C  s   |j  sJ dt|jj}t|jj|ks6J d| |  krL|k sdn J d| d| d|dk rt||7 }t|D ]6}||krq||jj| |jj| ks|J d| dq|| j|j|j|}| 	||jj
|jjS )	Nzindex must be an integer tensorz0source and index tensors must have the same rankzgather axis z must be < source rank (r   r   z
index dim z( must match the corresponding source dim)r4   rG   r   r_   r   r   r$   Zcreate_gatherr   r  rj   )r   r  r  r'   r  r   gatherr   r   r   r    s    .*zTritonSemantic.gather)r   num_binsr:  r(   c                 C  s~   t |jdksJ d|j s(J d|d urX| ||j}|jj sRtd|j	}| 
| j|j	||ttj|gS )Nr   z histogram only supports 1D inputz%histogram only supports integer inputrz  )r   r   r4   rG   r   r_   rj   r  r*   r   r#   r$   Zcreate_histogramr+   r   r,   )r   r   r  r:  r   r   r   	histogram  s    zTritonSemantic.histogram)r`   r   r(   c                 C  s@   t dt|jt|kr td|jdt||j  |S )Nr   zAShape of input to multiple_of does not match the length of valuesztt.divisibility)	r   r   r   r*   r   set_attrr   	make_attrget_contextr   r`   r   r   r   r   multiple_of  s    zTritonSemantic.multiple_ofc                 C  s:   t |jt |krtd|jdt||j  |S )NzDShape of input to max_contiguous does not match the length of valuesztt.contiguityr   r   r*   r   r  r   r  r  r  r   r   r   max_contiguous  s    zTritonSemantic.max_contiguousc                 C  s:   t |jt |krtd|jdt||j  |S )NzCShape of input to max_constancy does not match the length of valuesztt.constancyr  r  r   r   r   max_constancy  s    zTritonSemantic.max_constancy)r(   c                 C  s   |  | j tjS r%   )r#   r$   Zcreate_barrierr+   rR  r  r   r   r   debug_barrier  s    zTritonSemantic.debug_barrierzList[TensorTy])prefixargshexr(   c                 C  s   | ds|r|d7 }| ds4|r4|d d d }t|dkrR|dsRd| }dd |D }dd |D }| | j||||tjS )N r   r   r   c                 S  s   g | ]
}|j qS r   r  r   argr   r   r   r     r   z/TritonSemantic.device_print.<locals>.<listcomp>c                 S  s   g | ]}|j  qS r   )r4   r   r  r   r   r   r     r   )endswithr   
startswithr#   r$   Zcreate_printr+   rR  )r   r  r  r  new_args	is_signedr   r   r   device_print  s    zTritonSemantic.device_print)r~   r   r(   c                 C  s(   | j jjsd S | | j |j|tjS r%   )r$   rw   debugr#   Zcreate_assertr   r+   rR  )r   r~   r   r   r   r   r|     s    
zTritonSemantic.device_assertc                 C  s   |  | j|jtjS r%   )r#   r$   Zcreate_assumer   r+   rR  )r   r~   r   r   r   assume  s    zTritonSemantic.assumec                 C  sZ  t |trt|}t |tjrt |jtr:| j|jS |rvd|j  krTdk shn J d|j d| j|jS d|j  krdk sn J d|j d| j	|jS nt |tj
r>|jjdksJ d	|j sJ d
|jtjkr|r| j|j| j |j S |jtjkr8|s8ds8J d|jS dsVJ dt| d S )NrM   rN   z@Block pointers only support 64 bit `shape/strides`, got a value z which is out of the rangerK   rL   zFBlock pointers only support 32 bit `offsets/block_shape`, got a value r   z*Expected a scalar in shape/strides/offsetsz8Expected an integer scalar type in shape/strides/offsetsFzzBlock pointers only support 32 bit `offsets/block_shape`, add a `.to(tl.int32)` or use regular indexing for 64 bit supportz3Unsupported element type in shape/strides/offsets: )rT   r&   r+   r]   r>   r7   r$   rU   r  r  r#   r   r4   rG   rX   r   r   Zget_int64_tyr   r,   r_   )r   r0  rJ  r   r   r   _convert_elem_to_ir_value  s2    



z(TritonSemantic._convert_elem_to_ir_valuec                   s,   t |dr fdd|D S | gS )Nr-  c                   s   g | ]} | qS r   )r  r/  rJ  r   r   r   r     r   z8TritonSemantic._convert_to_ir_values.<locals>.<listcomp>)r1  r  )r   	list_likerJ  r   r  r   rL    s    
z$TritonSemantic._convert_to_ir_values)baser(   c              	     s<  |  |}|  |}| j |dd}|j r8|jj r@td|jjtjkrh| |t	tj
|jj}t dsx g dd  D  tdd  D sJ d	t|ds|g}d
d |D }t|ttt|ksJ dt fdd||||fD sJ d| j|j||| |}| |t	t|jj S )NFrI  zMExpected `base` to be a pointer type (but not a block pointer type or others)r-  c                 S  s"   g | ]}t |tjr|jn|qS r   r.  r/  r   r   r   r     r   z1TritonSemantic.make_block_ptr.<locals>.<listcomp>c                 s  s0   | ](}t |to&d |  ko"dk n  V  qdS )rK   rL   N)rT   r&   r/  r   r   r   r     r   z0TritonSemantic.make_block_ptr.<locals>.<genexpr>zGExpected a list of constant integers (`int32_t` range) in `block_shape`c                 S  s"   g | ]}t |tjr|jn|qS r   r.  r/  r   r   r   r   #  r   z<Expected a permutation of (0, 1, ..., len(order)-1) in orderc                 3  s   | ]}t  t |kV  qd S r%   )r   )r   r  r4  r   r   r   '  r   zBExpected shape/strides/offsets/block_shape to have the same length)rL  r_   rc   r8  r   r*   r+   rV   rn   rA  r@  rB  r1  r  r   r   r   r   r$   Zcreate_make_block_ptrr   r#   r   )r   r  r   stridesrM  r4  orderr   r   r  r   make_block_ptr
  s,    



 "zTritonSemantic.make_block_ptrc                 C  s(   | j |dd}| | j|j||jS rQ  )rL  r#   r$   Zcreate_advancer   r_   )r   r  rM  r   r   r   advance0  s    zTritonSemantic.advancezList[tl.constexpr]ztl.tensor_descriptor)r  r   r  r4  r(   c                   s  t |}d|  krdks.n td| dt ||krRtd| dt | t ||krvtd| dt | t|jtjsJ |jjjd	 }t|d
 }|| dk rtd| d| d||  dt|d
 |d
< |d
 dkrtd|d
   fdd|D } fdd|D }t	|}t|j
tjsDJ t|j
j|}|j}	|j
j }
 j|	dd |D dd |D ||
}t||||S )Nr      z Expected 1 <= ndim <= 5 but got z dimensionsz	Expected z strides but got zExpected block_shape to have z dimensions but got rp  r   r  zRDescriptor block shape must have at least 16 bytes in the last dimension, but got z * z = z bytesz-Tensor descriptor last dim must be 1 but got c                   s   g | ]}  |tjqS r   )r   r+   r,   r   r  r   r   r   Q  r   z9TritonSemantic.make_tensor_descriptor.<locals>.<listcomp>c                   s   g | ]}  |tjqS r   )r   r+   rX   r   r  r   r   r   R  r   c                 S  s   g | ]
}|j qS r   r  r   r   r   r   r   r   \  r   c                 S  s   g | ]
}|j qS r   r  r   r   r   r   r   ]  r   )r   r*   rT   r4   r+   rA  r8  r  r   Z_unwrap_shaper_   r   r   r   r$   Zcreate_make_tensor_descriptorZtensor_descriptor)r   r  r   r  r4  rN  Z	elem_sizeZcontig_dim_sizer_   Zbase_handleZis_signed_intr   r   r  r   make_tensor_descriptor7  s8    
z%TritonSemantic.make_tensor_descriptor)T)FFTF)N)T)yr   r   r    r+   r#   __annotations__langr   r.   r/   r6   rI   r^   re   rs   r   r   r   r   r   r   r   r   r   r   r   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r{   r   rz   r   r   r   rZ   r   r   r   r   r   r   r   r   r   r   ro   r   r   rn   r  r  r  r$  r)  r,  r5  r=  rD  rE  rO  rP  rS  rZ  r`  ra  rc  re  rg  ri  rk  rv  rw  ry  r{  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r|   r  r  rL  r  r  r  r   r   r   r   r"      s   
	2) %		
	8	o<,&&	N,4&r"   )
__future__r   r  typingr   r   r   r   r   r   r	   rh   Ztriton.runtimer
   Z_C.libtritonr   rl  r   r+   r   r   	Exceptionr   r"   r   r   r   r   <module>   s   $	