o
    Zh                 
   @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm  mZ d dlm Z  d dl!m"Z"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7m8Z8 ddl9m:Z: ddl;m<Z<m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZL ddlMmNZNmOZOmPZPmQZQ ddlmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ ddl`maZbmcZcmdZdmeZe ddlfmgZg ddlhmiZi ddljmkZkmlZlmmZmmnZnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZxmyZy dd lzm{Z{m|Z|m}Z}m~Z~mZmZ dd!lmZmZmZmZmZ dd"lmZ erd d#lmZ d d$lmZ d d%lmZ dd&l7mZ dd'lmZ ed(ZeeZejed)Zejed*Zejed+Ze: Z9G d,d- d-Zeddxd0d1Zeddxd2d3ZG d4d5 d5ZejG d6d7 d7ZejG d8d9 d9Zdyd>d?ZG d@dA dAetZe jZdzdDdEZd{dGdHZdzdIdJZd|dKdLZdzdMdNZd}dPdQZd~dTdUZG dVdW dWeoZddYdZZddd^d_ZG d`da daesZedb G dcdd ddeZG dedf dfZejG dgdh dhZG didj djZejG dkdl dlZG dmdn dneneeeeeef f f ZG dodp dpee ZG dqdr dreZddvdwZdS )    )annotationsN)IterableSequence)	lru_cache)AnyCallablecastOptionalTYPE_CHECKINGUnion)
PRECEDENCE)get_interface_for_device)identitypreserve_rng_state)is_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)has_triton_package   )free_symbol_is_type
prefix_strsymbol_is_typeSymT)ValueRanges   )configirmetrics)AsyncCompile)	code_hashget_pathPyCodeCache)DefaultHandler)triton_heuristics)benchmarker)AutotuneHintDevicePropertiesTRITON_MAX_BLOCKTRITON_MAX_RSPLIT)get_max_y_gridnext_power_of_2)BaseSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfDelayReplaceLineget_bounds_index_exprget_fused_kernel_nameget_kernel_metadatais_welford_reductionPlaceholderprefix_is_reduction	sympy_dotsympy_product
sympy_substriton_typetriton_version_uses_attrs_dictupcast_compute_type)_opsReductionType	StoreModeV)"get_kernel_category_by_source_code   )BlockPatternMatcher)ArgNameBackendFeatureConstexprArgCSECSEVariableDeferredLineIndentedBufferInplacedBufferOpOverridesPythonPrinter
RemovedArgSizeArg	TensorArgWorkspaceArgWorkspaceZeroMode)constant_reprIterationRangesIterationRangesEntryIterationRangesRoot
SIMDKernelSIMDScheduling)	config_ofequal_1_arg_indicesnon_constexpr_signatureshould_unwrap_unspec_argsignature_to_meta)SymbolicCallArg)
ModuleType)TypeVarDtypePropagationOpsHandler)IRNode)SIMDKernelFeatures_TZ
perf_hintsZscheduleZfusionc                   @  s8   e Zd ZU dZi Zded< i Zded< edddZdS )OpDtypeSupportz
    Some Triton ops such as libdevice and tl.math only support float32 and float64.
    This class records which dtypes are supported by specific IR ops.
    z"dict[str, OrderedSet[torch.dtype]]supported_dtypeszdict[str, bool]convert_outputsfuncCallable[..., str]convert_outputboolreturnNonec                 C  s*   |j }ttjtjg| j|< || j|< d S N)__name__r   torchfloat32Zfloat64ri   rj   )clsrk   rm   Zop_name rv   M/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/triton.pyregister_upcast   s   zOpDtypeSupport.register_upcastN)rk   rl   rm   rn   ro   rp   )	rr   
__module____qualname____doc__ri   __annotations__rj   classmethodrx   rv   rv   rv   rw   rh   v   s   
 rh   ro   strc                  C  s(   t  sdS ddl} t| jjdrdS dS )zd
    import AttrsDescriptor if the triton version is new enough to have this
    class defined.
     r   NZAttrsDescriptorz4from triton.compiler.compiler import AttrsDescriptor)r   Ztriton.compiler.compilerhasattrcompiler)tritonrv   rv   rw   gen_attr_descriptor_import   s   r   c                  C  s6   t  } | d t  }r| | | d |  S )NzD
        import triton
        import triton.language as tl
        a  
        from torch._inductor.runtime import triton_helpers, triton_heuristics
        from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
        from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
        )rL   splicer   	writelinegetvalue)ZimportsZ	attr_descrv   rv   rw   gen_common_triton_imports   s   

r   c                   @  sp   e Zd ZdZeejejgZeej	ej
ejgeZdd eD Zdd eD Zedd	d
ZedddZdS )TritonSymbolszU
    Stores sympy.Symbol instances and constants associated with triton codegen.
    c                 C  s(   i | ]}|t jt|  d dddqS )offsetTintegerZnonnegative)sympySymbolr   .0symtrv   rv   rw   
<dictcomp>   s    zTritonSymbols.<dictcomp>c                 C  s,   i | ]}|t jt|   d dddqS )BLOCKTr   Zpositive)r   r   r   upperr   rv   rv   rw   r      s    treerV   ro   sympy.Symbolc                 C     | j |j S rq   )block_sizesr   ru   r   rv   rv   rw   get_block_size      zTritonSymbols.get_block_sizec                 C  r   rq   )block_offsetsr   r   rv   rv   rw   get_block_offset   r   zTritonSymbols.get_block_offsetN)r   rV   ro   r   )rr   ry   rz   r{   r   r   ZR0_INDEXZR1_INDEXreduction_typesXBLOCKZYBLOCKZZBLOCKblock_typesr   r   r}   r   r   rv   rv   rv   rw   r      s    r   c                   @  sv   e Zd ZU ded< ded< ded< ded< d	ed
< dddZdddZdddZdddZdddZe	dddZ
dS )IndexingOptionsr~   	index_strOrderedSet[str]	mask_varsOptional[str]
expand_strrn   _has_rindex
sympy.Exprindexro   c                 C  
   t | jS rq   )rn   r   selfrv   rv   rw   has_mask      
zIndexingOptions.has_maskc                 C  s   t | jtjS rq   )r   r   r   TMPr   rv   rv   rw   has_indirect      zIndexingOptions.has_indirectc                 C     | j S rq   )r   r   rv   rv   rw   
has_rindex   s   zIndexingOptions.has_rindexc                 C     t dd | jD S )Nc                 s      | ]
}t |d V  qdS )tmpNr~   
startswithr   maskrv   rv   rw   	<genexpr>       z.IndexingOptions.has_tmpmask.<locals>.<genexpr>anyr   r   rv   rv   rw   has_tmpmask      zIndexingOptions.has_tmpmaskc                 C  r   )Nc                 s  r   )rNr   r   rv   rv   rw   r      r   z,IndexingOptions.has_rmask.<locals>.<genexpr>r   r   rv   rv   rw   	has_rmask   r   zIndexingOptions.has_rmaskc                 C  s   | j rdtt| j S dS )N & rp   )r   joinmapr~   r   rv   rv   rw   mask_str   s   zIndexingOptions.mask_strNro   rn   ro   r~   )rr   ry   rz   r|   r   r   r   r   r   propertyr   rv   rv   rv   rw   r      s   
 




r   c                   @  s  e Zd ZU ded< ded< ded< ded< d	ed
< ded< d	ed< dZded< edCddZedCddZedCddZedCddZ	dDd d!Z
edEd&d'ZdFd,d-ZdGdHd0d1ZdId3d4ZdJd5d6ZdKd7d8ZdLd9d:ZdLd;d<ZdLd=d>ZdLd?d@ZdLdAdBZdS )MBlockPtrOptionsBlockParametersparamsr   constant_offset	list[int]orderr   r   Sequence[sympy.Expr]broadcast_shapez
list[bool]broadcasting_dimsfinal_shapeNzOptional[list[int]]_boundary_checkro   list[sympy.Expr]c                 C     | j jS rq   )r   shaper   rv   rv   rw   r         zBlockPtrOptions.shapec                 C  r   rq   )r   block_shaper   rv   rv   rw   r      r   zBlockPtrOptions.block_shapec                 C  r   rq   )r   stridesr   rv   rv   rw   r      r   zBlockPtrOptions.stridesc                 C  r   rq   )r   offsetsr   rv   rv   rw   r      r   zBlockPtrOptions.offsetsvaluer~   initial_shapeallow_implicitrn   c                   s   dd t | j| jD }t|||}tjj t| jo0t|t|kp0t fddt ||D }|r5|rCd| dtj	
| j d}t|| j|}|S )z
        Generate a broadcast and a reshape for the block pointer.
        This restores stride-0 dimensions which were removed from the block pointer.
        c                 S  s    g | ]\}}|rt jjn|qS rv   )r   SOne)r   dimZis_broadcastingrv   rv   rw   
<listcomp>  s    zABlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<listcomp>c                 3  s.    | ]\}}  |d p  || V  qdS rD   Nstatically_known_equals)r   Zpre_dimZpost_dimsizevarsrv   rw   r     s    

z@BlockPtrOptions.codegen_broadcast_and_reshape.<locals>.<genexpr>tl.broadcast_to(, ))zipr   r   triton_reshaperB   graphr   r   lenkernelindex_to_str)r   r   r   r   r   Zpre_broadcast_shapeZrequire_broadcastrv   r   rw   codegen_broadcast_and_reshape  s"   
z-BlockPtrOptions.codegen_broadcast_and_reshaperange_treeslist[IterationRangesRoot]get_max_blockCallable[[str], int]c              	     s|  t jjdfdd}|| j| _|| j| _fdd| jD }fd	d| jD }t|r3d
|d< dd t| j|D }dd t||D   fddtdi fddt	
|  D } dd |D }	t jjrz|d jdksuJ |	d t jj}
t jjst| jtt jj|
 krt jj r|	tjjg|
 7 }	t| t jj|tttt| j||	||d}|| |S )z,Helper to create a  BlockPtrOptions instanceexprsIterable[sympy.Expr]ro   r   c                   s    fdd| D S )Nc                   s   g | ]}  |qS rv   )lookup_precomputed_sizer   exprr   rv   rw   r   >      z?BlockPtrOptions.create.<locals>.lookup_size.<locals>.<listcomp>rv   )r   r   rv   rw   lookup_size=     z+BlockPtrOptions.create.<locals>.lookup_sizec                      g | ]}  |d qS )r   r   )r   strider   rv   rw   r   F      z*BlockPtrOptions.create.<locals>.<listcomp>c                   r   )rD   r   )r   r   r   rv   rw   r   L  r   Fc                 S     g | ]\}}|s|qS rv   rv   )r   r   Zis_singletonrv   rv   rw   r   V      c                 S     g | ]}t |qS rv   )r   )r   dimsrv   rv   rw   r   ]      c                   s   dd t |  D S )z@Removes any broadcasting or singleton dims from a given sequencec                 S  r  rv   rv   )r   itemZis_removablerv   rv   rw   r   a  r  z?BlockPtrOptions.create.<locals>.remove_dims.<locals>.<listcomp>)r   )it)removable_dimsrv   rw   remove_dims_  s   z+BlockPtrOptions.create.<locals>.remove_dimsc                      i | ]	\}}| |qS rv   rv   r   keyval)r
  rv   rw   r   i      z*BlockPtrOptions.create.<locals>.<dictcomp>c                 S  s   g | ]}t |qS rv   )r   r   r   r   rv   rv   rw   r   m  r   r   x)r   r   r   r   r   r   r   N)r   r   ro   r   rv   )rB   r   r   r   r   r   allr   r   dataclassesasdictitemsr   no_x_dimprefixpopnum_reduction_dimsinside_reductionr   numelsfeaturesis_reductionr   r   r   r   r   listreversedrangecompute_boundary_check)r   r   r   r   r   r   r   Zsingleton_dimsr   r   Zreduction_ndimresultrv   )r	  r
  r   rw   create0  sT   


	


	zBlockPtrOptions.creater   replacementr   r   c                 C  s   t j| }t|||iS )zN
        Replaces instances of {symt}_offset with the new expression.
        )r   r   r;   )r   r   r$  r   roffsetrv   rv   rw   replace_offset  s   
zBlockPtrOptions.replace_offsetTnamec                   s   dfdd t jj}g j}|s fdd|D }jdkr+| d	|j d
n|d|j d|j d|j d|j d|| g}dd	| d
S )a  
        Codegen a call to tl.make_block_ptr()

        Args:
            name: variable name for pointer
            roffset: should rn_offset be included in offsets=..., for use with tl.advance()

        Returns:
            "tl.make_block_ptr(...)"
        r   r   ro   c                   s$   t jD ]} | td|} q| S Nr   )r   r   r&  r   Integer)r   r   r   rv   rw   remove_roffsets  s   
z/BlockPtrOptions.format.<locals>.remove_roffsetsc                      g | ]} |qS rv   rv   r   r   )r*  rv   rw   r     r  z*BlockPtrOptions.format.<locals>.<listcomp>r    + (r   zshape=zstrides=zblock_shape=zorder=zoffsets=ztl.make_block_ptr(r   N)r   r   ro   r   )
rB   r   r   r   r   r   r   r   r   r   )r   r'  r%  fr   argsrv   )r*  r   rw   format  s   

zBlockPtrOptions.formatrp   c                   sF   t jjfddtj D   fddttjD _	dS )z6List of indices to pass to tl.load(boundary_check=...)c                   s   i | ]\}}| t | qS rv   r   )r   r   
block_size)r   rv   rw   r     s    z:BlockPtrOptions.compute_boundary_check.<locals>.<dictcomp>c                   sz   g | ]9} j| tjjs;j| j| s;j| tj|  s;t	j
jr9j| tjtj ks|qS rv   )r   r   r   r   Zerostatically_known_multiple_ofr   r   r;   rB   r   r  r   r   r   r   r   idx)block_to_maxr   r   rv   rw   r     s"    z:BlockPtrOptions.compute_boundary_check.<locals>.<listcomp>N)
rB   r   r   r   r   r  r   r   r   r   )r   r   rv   )r7  r   r   r   rw   r!    s   
z&BlockPtrOptions.compute_boundary_checkc                 C  s   | j d usJ | j S rq   )r   r   rv   rv   rw   boundary_check  s   zBlockPtrOptions.boundary_checkc                   s&   t j   fddjD }|S )av  
        Codegen string to pass to tl.advance(name, ...).

        Advance is the difference between offsets in each loop iteration.
        To compute it, we replace rN_offset with multiples of RN_BLOCK.
        Since we expect rN_offset to vary in range(0, rN_numel, RN_BLOCK), the first
        iteration has rN_offset=0, while the second has rN_offset=RN_BLOCK.
        c                   s,   g | ]} |  |tjj qS rv   )r&  r   r   r3  r,  rblockr   r   rv   rw   r     s    z3BlockPtrOptions.advance_roffset.<locals>.<listcomp>)r   r   r   )r   r   advancerv   r9  rw   advance_roffset  s
   
	zBlockPtrOptions.advance_roffsetc                 C     dS NFrv   r   rv   rv   rw   r        zBlockPtrOptions.has_indirectc                 C  r   )Nc                 s  s    | ]	}t |tjV  qd S rq   )r   r   r   r   rv   rv   rw   r     s
    

z-BlockPtrOptions.has_rindex.<locals>.<genexpr>)r   r   r   rv   rv   rw   r     s   zBlockPtrOptions.has_rindexc                 C  s   |   S rq   )r   r   rv   rv   rw   r        zBlockPtrOptions.has_rmaskc                 C  r=  r>  rv   r   rv   rv   rw   r     r?  zBlockPtrOptions.has_tmpmaskc                 C  s   t |  S rq   )rn   r8  r   rv   rv   rw   r        zBlockPtrOptions.has_maskro   r   )
r   r~   r   r   r   r   r   rn   ro   r~   )r   r   r   r   r   r   r   r   r   r   ro   r   )r   r   r$  r   r   r   ro   r   T)r'  r~   ro   r~   )r   r   ro   rp   )ro   r   )r   r   ro   r   r   )rr   ry   rz   r|   r   r   r   r   r   r   r   staticmethodr#  r&  r0  r!  r8  r<  r   r   r   r   r   rv   rv   rv   rw   r      s<   
 
-
V	
#





r   r   	old_shaper   	new_shapec                 C  s   t |tr
t |tsJ dd |D }dd |D }||kr | S dd |D |kr5d|  dd| dS d	}g }|D ]#}|t|k rS||| krS|d
 |d7 }q;|dksYJ |d q;|t|ksgJ |  dd| dS )z7Workaround https://github.com/openai/triton/issues/2836c                 S     g | ]}t j|qS rv   rB   r   r   r   r   rv   rv   rw   r          z"triton_reshape.<locals>.<listcomp>c                 S  rG  rv   rH  rI  rv   rv   rw   r     rJ  c                 S  s   g | ]}|d kr|qS )1rv   )r   srv   rv   rw   r     rJ  ztl.reshape(z, [r   z])r   :rD   rK  rp   [])
isinstancer  r   r   append)r   rE  rF  Zold_shape_strZnew_shape_strr6  expandsizerv   rv   rw   r     s"   

r   c                   @  s(  e Zd Zd?ddZd?ddZd?d	d
Zd?ddZd?ddZd?ddZd?ddZ	d?ddZ
d?ddZd?ddZd?ddZd?ddZeZd?ddZd@d d!Zd?d"d#Zd?d$d%Zd?d&d'Zd?d(d)Zd?d*d+Zd?d,d-Zd?d.d/Zd?d0d1Zd?d2d3Zd?d4d5Zd?d6d7Zd?d8d9Zd?d:d;Zd?d<d=Zd>S )ATritonPrinterr   r   ro   r~   c                 C  4   t |jdks	J d| |jd  dtjj dS )NrD   libdevice.trunc(r   ).to(r   r   r/  _printrB   r   index_dtyper   r   rv   rv   rw   _print_TruncToInt      zTritonPrinter._print_TruncToIntc                 C  s*   t  rtjjr| }|S d| d}|S )Nztl.full([], z, tl.float64))r   	is_fbcoders   versionhip)r   r   retrv   rv   rw   _print_Float   s
   zTritonPrinter._print_Floatc                 C  s6   t |jdks	J | |jd td d }| dS )NrD   r   Atom      ?z.to(tl.float64))r   r/  parenthesizer   )r   r   rL  rv   rv   rw   _print_ToFloat'  s   
zTritonPrinter._print_ToFloatc                 C  sT   |j \}}|jr|jr| |j dtd d S | |}| |}d| d| dS )N % rc  rd  z!triton_helpers.remainder_integer(r   r   )r/  is_nonnegative	stringifyr   rY  r   r   quotdivZquot_sZdiv_srv   rv   rw   _print_PythonMod,  s   


zTritonPrinter._print_PythonModc                 C  s^   |j sJ |j\}}|jr|jr| |jdtd d S | |}| |}d| d| dS )N // rc  rd  z!triton_helpers.div_floor_integer(z,  r   )
is_integerr/  rh  ri  r   rY  rj  rv   rv   rw   _print_FloorDiv4  s   



zTritonPrinter._print_FloorDivc                 C  s   |  |jdtd d S )N / rc  rd  )ri  r/  r   r[  rv   rv   rw   _print_IntTrueDiv?  s   zTritonPrinter._print_IntTrueDivc                 C  rU  NrD   libdevice.floor(r   rW  r   rX  r[  rv   rv   rw   _print_floorD  r]  zTritonPrinter._print_floorc                 C  rU  rs  rX  r[  rv   rv   rw   _print_FloorToIntJ  r]  zTritonPrinter._print_FloorToIntc                 C  rU  NrD   libdevice.ceil(r   rW  r   rX  r[  rv   rv   rw   _print_ceilingP     "zTritonPrinter._print_ceilingc                 C  rU  rw  rX  r[  rv   rv   rw   _print_CeilToIntT  rz  zTritonPrinter._print_CeilToIntc                 C  s   d|  | dS )Nzlibdevice.sqrt(().to(tl.float32)))rY  r[  rv   rv   rw   _helper_sqrtX  r   zTritonPrinter._helper_sqrtc                 C  s*   d|  |jd  d|  |jd  dS )Nlibdevice.pow(r   r   rD   r   )rY  r/  r[  rv   rv   rw   _print_FloatPow[  s   (zTritonPrinter._print_FloatPowc                 C  sH   |  |jd }|  |jd }|  |jd }d| d| d| dS )Nr   rD   r   	tl.where(r   r   )doprintr/  )r   r   cpqrv   rv   rw   _print_Whereb  s   zTritonPrinter._print_Wherecmpc                 C  s   t |jdkr| |jd S t |jd }t|}| ||jd|  }| ||j|d  }tdd ||fD \}}|dv sKJ d| d	d
| d| d| d| d| d| d| d| dS )zK
        Helper for max/min code genereration.
        cmp: > or <
        rD   r   r   Nc                 s  s    | ]	}d | dV  qdS )(r   Nrv   r   r  rv   rv   rw   r   w      z6TritonPrinter._print_min_max_helper.<locals>.<genexpr>)><zUnexpected comparator: ''r  z * ( z= z) + )))r   r/  rY  typetuple)r   r   r  midru   abrv   rv   rw   _print_min_max_helperh  s   6z#TritonPrinter._print_min_max_helperc                 C     |  |dS )Nr  r  r[  rv   rv   rw   
_print_Min{  rA  zTritonPrinter._print_Minc                 C  r  )Nr  r  r[  rv   rv   rw   
_print_Max~  rA  zTritonPrinter._print_Maxc                 C  *   t |jdks	J d| |jd  dS )NrD   tl_math.abs(r   r   r   r/  rY  r[  rv   rv   rw   
_print_Abs     zTritonPrinter._print_Absc                 C  r  )NrD   zlibdevice.cos((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_cos  r  z&TritonPrinter._print_OpaqueUnaryFn_cosc                 C  r  )NrD   zlibdevice.cosh((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_cosh  r  z'TritonPrinter._print_OpaqueUnaryFn_coshc                 C  r  )NrD   zlibdevice.acos((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_acos  r  z'TritonPrinter._print_OpaqueUnaryFn_acosc                 C  r  )NrD   zlibdevice.sin((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_sin  r  z&TritonPrinter._print_OpaqueUnaryFn_sinc                 C  r  )NrD   zlibdevice.sinh((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_sinh  r  z'TritonPrinter._print_OpaqueUnaryFn_sinhc                 C  r  )NrD   zlibdevice.asin((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_asin  r  z'TritonPrinter._print_OpaqueUnaryFn_asinc                 C  r  )NrD   zlibdevice.tan((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_tan  r  z&TritonPrinter._print_OpaqueUnaryFn_tanc                 C  r  )NrD   zlibdevice.tanh((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_tanh  r  z'TritonPrinter._print_OpaqueUnaryFn_tanhc                 C  r  )NrD   zlibdevice.atan((r   r|  r  r[  rv   rv   rw   _print_OpaqueUnaryFn_atan  r  z'TritonPrinter._print_OpaqueUnaryFn_atanc                 C  rU  )NrD   zlibdevice.llrint(r   rW  r   rX  r[  rv   rv   rw   _print_RoundToInt  r]  zTritonPrinter._print_RoundToIntc                 C  sf   t |jdks	J |j\}}|jr|dk sJ td| d| |td }d| d| d|  S )	Nr   r   zOFor integer inputs, only non-negative ndigits are currently supported, but got .ZMulzlibdevice.nearbyint(1e * z) * 1e)r   r/  ro  
ValueErrorre  r   )r   r   numberndigitsZ
number_strrv   rv   rw   _print_RoundDecimal  s   

z!TritonPrinter._print_RoundDecimalN)r   r   ro   r~   )r   r   r  r~   ro   r~   ) rr   ry   rz   r\  rb  rf  rm  rp  rr  ru  rv  ry  r{  r}  r  Z_print_PowByNaturalr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rv   rv   rv   rw   rT    s<    


























rT  dtypetorch.dtypec                 C     t t| S )zCConvert torch.dtype to triton type and upcast [b]float16 to float32)r<   r>   r  rv   rv   rw   triton_compute_type  r   r  intc                 C  s$   t | } t| dd}|r|d S dS )z'Number of bits of triton_compute_type()itemsizeN   r  )r>   getattr)r  r  rv   rv   rw   _get_primitive_bitwidth  s
   r  c                 C  s   | t jkrt j} t| S )z@Convert torch.dtype to triton type, with fix for storing tl.bool)rs   rn   int8r<   r  rv   rv   rw   triton_store_type  s   
r  c                 C  s&   t | r| jr| jdkrtjS t| S )z0Implicit upcasts used for Triton reduction types   )r   	is_signedr  rs   int32r>   r  rv   rv   rw   upcast_acc_dtype  s   r  c                 C  r  )z:Convert torch.dtype to triton type, with reduction upcasts)r  r  r  rv   rv   rw   triton_acc_type  r   r  rn   c                 C  s   | j dko| jS )Nr   )r  is_floating_pointr  rv   rv   rw   low_precision_fp     r  varUnion[CSEVariable, Any]c                 C  s,   t | tsdS | j}t |tjrt|S dS r>  )rP  rJ   r  rs   r  )r  r  rv   rv   rw   low_precision_fp_var  s   
r  c                      s&   e Zd Zd fddZd	d
 Z  ZS )TritonCSEVariableboundsValueRanges[Any]r  r  ro   rp   c                   s0   t  ||| tt  | _|d usJ dd S )Nz!TritonCSEVariable must have dtype)super__init__r   r~   r   )r   r'  r  r  	__class__rv   rw   r    s   zTritonCSEVariable.__init__c                 C  sf   |D ].}t |tr| j|j qt |tjr0tjD ]}t||r/| jt	|  dg  nqqd S )Nr   )
rP  r  r   updater   r   r   r   r   r   )r   r'  r/  kwargsargr   rv   rv   rw   update_on_args  s   


z TritonCSEVariable.update_on_args)r  r  r  r  ro   rp   )rr   ry   rz   r  r  __classcell__rv   rv   r  rw   r    s    r  rd   c                  C  s   ddl m}  |  S )Nr   rc   )!torch._inductor.dtype_propagationrd   rc   rv   rv   rw   get_dtype_handler  s   r  Trm   Callable[[_T], _T]c                   s.   ddddfddd fd
d}|S )z
    Codegen helper to upcast arguments to float32, depending on the config and dtype.
    This decorates tl.math/libdevice codegen functions.
    ro   rn   c                 S  s&   t jj ot| to| jtjtjfv S rq   )	r   r   codegen_upcast_to_fp32rP  rJ   r  rs   float16bfloat16r  rv   rv   rw   needs_upcast  s
   
z*maybe_upcast_float32.<locals>.needs_upcastr~   c                   s    | rdnd}|  | S )N.to(tl.float32)r   rv   )r  Zupcast_stringr  rv   rw   maybe_upcast_arg  s   z.maybe_upcast_float32.<locals>.maybe_upcast_argrk   Callable[..., Any]c                   s$   t   d fdd}|S )Nro   r~   c            	        s   fdd| D }fdd|  D }|i |} o-tfddt| | D }|s2d ntt j| i |}|tj	d fv}|rS|d urSdt
| dnd	}| | S )
Nc                   r+  rv   rv   )r   r  r  rv   rw   r      r  zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<listcomp>c                   r  rv   rv   r  r  rv   rw   r   !  r  zLmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<dictcomp>c                 3  s    | ]} |V  qd S rq   rv   r   r  r  rv   rw   r   %      
zKmaybe_upcast_float32.<locals>.decorator.<locals>.wrapped.<locals>.<genexpr>.to(r   r   )r  r   	itertoolschainvaluesr  r  rr   rs   rt   r<   )	r/  r  Zupcast_argsZupcast_kwargsr"  Zany_needs_upcastZresult_dtypeZneeds_downcastZdowncast_string)rm   rk   r  r  rv   rw   wrapped  s$   z8maybe_upcast_float32.<locals>.decorator.<locals>.wrappedr   )rh   rx   )rk   r  rm   r  r  )rk   rw   	decorator  s   z'maybe_upcast_float32.<locals>.decoratorNr   r   )rk   r  ro   r  rv   )rm   r  rv   r  rw   maybe_upcast_float32	  s   
r  c                   @  s  e Zd ZdZeejZe		dddd	Z	edd
dZ
edd Zedd Zee dd Zedd Zedd Zee dd Zee dd Zee dd Zee dd Zee dd Zee d d! Zee d"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zedejdd,d-d.d/Zee d0d1 Z ee d2d3 Z!ee d4d5 Z"ee d6d7 Z#ed8d9 Z$ed:d; Z%ee d<d= Z&ee d>d? Z'ee d@dA Z(ee dBdC Z)ee dDdE Z*ee dFdG Z+ee dHdI Z,ee dJdK Z-ee dLdM Z.ee dNdO Z/ee dPdQ Z0ee dRdS Z1ee dTdU Z2ee dVdW Z3ee dXdY Z4ee dZd[ Z5ee d\d] Zee d^d_ Z6ed`da Z7edbdc Z8eddde Z9edfdg Z:edhdi Z;edjdk Z<edldm Z=edndo Z>edpdq Z?edrds Z@edtdu ZAedvdw ZBedxdy ZCedzd{ ZDee d|d} ZEee d~d ZFee dd ZGee dd ZHee dd ZIedd ZJee dd ZKee dd ZLee dd ZMee dd ZNeedddd ZOeedddd ZPee dd ZQee dd ZRedd ZSedd ZTee dd ZUedd ZVee dd ZWdS )TritonOverrideszMap element-wise ops to TritonNTr  r  	src_dtypeOptional[torch.dtype]c                 C  sz   ddd}|d urt |||tjjtj_|tjkr d|  d	S |tjkr*|  d
S |r1t|}nt|}|  d| dS )Nr  r  	dst_dtypero   r  c                 S  sl   | |krdS t jt jf}| |v r||v r| |krJ d| t jks&|t jkr(dS | t jks2|t jkr4dS dS )Nr   zCConversions between float8_e5m2 and float8_e4m3fn is not supported!r  r   )rs   Zfloat8_e4m3fnZfloat8_e5m2)r  r  Z
fp8_dtypesrv   rv   rw   _get_min_elements_per_threadF  s   
z>TritonOverrides.to_dtype.<locals>._get_min_elements_per_threadr  z != 0)z.to(tl.int8).to(tl.uint8)r  r   )r  r  r  r  ro   r  )	maxrB   r   min_elem_per_threadrs   rn   Zuint8r  r  )r  r  r  Zuse_compute_typesr  	out_dtyperv   rv   rw   to_dtype?  s   




zTritonOverrides.to_dtypec           
      C  s   t |}|tjtjfv rNtjjrNt|dd }|  d| d}|tjtjfv r7t|dd }d| }| d| d}|tjtjfv rL| dS |S t	|}t	|}||kr\d	nd
}	|  d| d|	 dS )Nr  r  z.to(tl.r   ztl.r  z, bitcast=True)r  TrueFalsez
, bitcast=)
r  rs   r  r  r   r   r  r~   splitr  )
r  r  r  triton_dtypeZtriton_src_dtypeZcast_xZtriton_type_nameZsrc_dtype_bitwidthZtarget_dtype_bitwidthZbitcastrv   rv   rw   to_dtype_bitcastx  s"   

z TritonOverrides.to_dtype_bitcastc                 C  sD   t j|}t|| }t|}|dkr|S d| d| d| dS )Nz
tl.float32tl.full(r   r   )rs   Z_prims_commonZdtype_to_typerU   r  )r   r  r   type_Z
triton_valr<   rv   rv   rw   _shaped_constant  s   z TritonOverrides._shaped_constantc                 C  s   | j ||g dS )Nr   )r  )ru   r   r  rv   rv   rw   constant  s   zTritonOverrides.constantc                 C     d|  dS )Nr  r   rv   r  rv   rv   rw   abs     zTritonOverrides.absc                 C  X   d|  d| d}t | st |r*t | |}|tjtjfv r*| dt| d}|S )Nr  rq  r   r  )r  r  truedivrs   r  rt   r<   r  youtr  rv   rv   rw   r     s   zTritonOverrides.truedivc                 C  r  )Nr  rg  r   r  )r  r  modrs   r  rt   r<   r  rv   rv   rw   r    s   zTritonOverrides.modc                 C  r  )Nzlibdevice.abs(r   rv   r  rv   rv   rw   libdevice_abs  r  zTritonOverrides.libdevice_absc                 C  s&   t jrd|  dtj dS d|  dS )z
        When use_fast_math, use the ftz (flushing to zero) variant
        of exponent computation.

        Check https://github.com/triton-lang/triton/issues/5735 for
        more details.
        libdevice.exp2(r  r   ztl_math.exp()r   use_fast_mathr  _LOG_2_Er  rv   rv   rw   exp  s   
zTritonOverrides.expc                 C  r  )Nzlibdevice.exp(r   rv   r  rv   rv   rw   libdevice_exp  r  zTritonOverrides.libdevice_expc                 C  r  )Nr  r   rv   r  rv   rv   rw   exp2  r  zTritonOverrides.exp2c                 C  r  )Nzlibdevice.expm1(r   rv   r  rv   rv   rw   expm1  r  zTritonOverrides.expm1c                 C  r  Nzlibdevice.sqrt(r   rv   r  rv   rv   rw   sqrt  r  zTritonOverrides.sqrtc                 C  r  r  rv   r  rv   rv   rw   libdevice_sqrt  r  zTritonOverrides.libdevice_sqrtc                 C  sl   t jj}|dkr
dS |dkrd|  d|  dS |dkr |  dS |d u r/ttd	tj| S td
|)NZcompile_errorzcompile error!Zruntime_errorz"triton_helpers.device_assert_then(z == 0, "injected assert fail", r   Zaccuracyz + 1r   z:unrecognized config triton.inject_relu_bug_TESTING_ONLY = )	r   r   Zinject_relu_bug_TESTING_ONLYopsmaximumr  rs   r  AssertionError)r  bugrv   rv   rw   relu  s   
zTritonOverrides.reluc                 C     d|  d| dS )Nztriton_helpers.minimum(r   r   rv   r  r  rv   rv   rw   minimum     zTritonOverrides.minimumc                 C  r  )Nztriton_helpers.maximum(r   r   rv   r  rv   rv   rw   r    r  zTritonOverrides.maximumc                 C  s   d|  d| d| dS )Nr  r   r   rv   )r  r  r  rv   rv   rw   where  s   zTritonOverrides.whererD   )constraintsr  is_purepackc                 G  sh   t |}ddd |D }|d u rddgdd |D  }d|  d| d| d	| d
| d| dS )Nr   c                 S  r  rv   r~   r   irv   rv   rw   r     r  z:TritonOverrides.inline_asm_elementwise.<locals>.<listcomp>z=rc                 S  s   g | ]}d qS )r   rv   r   _rv   rv   rw   r     s    ztl.inline_asm_elementwise('z', 'z', [z	], dtype=z
, is_pure=z, pack=r   )r  r   )asmr  r  r  r  Zinputsr<   Z
input_refsrv   rv   rw   inline_asm_elementwise  s
   *z&TritonOverrides.inline_asm_elementwisec                 C  r  )Nztl_math.cos(r   rv   r  rv   rv   rw   cos  r  zTritonOverrides.cosc                 C  r  )Nzlibdevice.cos(r   rv   r  rv   rv   rw   libdevice_cos  r  zTritonOverrides.libdevice_cosc                 C  r  )Nztl_math.sin(r   rv   r  rv   rv   rw   sin  r  zTritonOverrides.sinc                 C  r  )Nzlibdevice.sin(r   rv   r  rv   rv   rw   libdevice_sin$  r  zTritonOverrides.libdevice_sinc                 C     t d)Nz/ops.index_expr not implemented outside a kernelNotImplementedError)ru   r   r  rv   rv   rw   
index_expr)  r   zTritonOverrides.index_exprc                 C  r(  )Nz+ops.masked not implemented outside a kernelr)  )r   bodyotherrv   rv   rw   masked-  r   zTritonOverrides.maskedc                 C  r  )Nzlibdevice.lgamma(r   rv   r  rv   rv   rw   lgamma1  r  zTritonOverrides.lgammac                 C  r  )Nzlibdevice.erf(r   rv   r  rv   rv   rw   erf6  r  zTritonOverrides.erfc                 C  r  )Nzlibdevice.cosh(r   rv   r  rv   rv   rw   cosh;  r  zTritonOverrides.coshc                 C  r  )Nzlibdevice.sinh(r   rv   r  rv   rv   rw   sinh@  r  zTritonOverrides.sinhc                 C  r  )Nzlibdevice.acos(r   rv   r  rv   rv   rw   acosE  r  zTritonOverrides.acosc                 C  r  )Nzlibdevice.acosh(r   rv   r  rv   rv   rw   acoshJ  r  zTritonOverrides.acoshc                 C  r  )Nzlibdevice.asin(r   rv   r  rv   rv   rw   asinO  r  zTritonOverrides.asinc                 C  r  )Nzlibdevice.asinh(r   rv   r  rv   rv   rw   asinhT  r  zTritonOverrides.asinhc                 C  r  )Nzlibdevice.atan2(r   r   rv   r  r  rv   rv   rw   atan2Y     zTritonOverrides.atan2c                 C  r  )Nzlibdevice.atan(r   rv   r  rv   rv   rw   atan^  r  zTritonOverrides.atanc                 C  r  )Nzlibdevice.atanh(r   rv   r  rv   rv   rw   atanhc  r  zTritonOverrides.atanhc                 C  r  )Nzlibdevice.copysign(r   r   rv   r7  rv   rv   rw   copysignh  r9  zTritonOverrides.copysignc                 C  r  )Nzlibdevice.erfc(r   rv   r  rv   rv   rw   erfcm  r  zTritonOverrides.erfcc                 C  r  )Nzlibdevice.erfinv(r   rv   r  rv   rv   rw   erfinvr  r  zTritonOverrides.erfinvc                 C  r  )Nzlibdevice.hypot(r   r   rv   r7  rv   rv   rw   hypotw  r9  zTritonOverrides.hypotc                 C  r  )Nzlibdevice.log10(r   rv   r  rv   rv   rw   log10|  r  zTritonOverrides.log10c                 C  r  )Nzlibdevice.log2(r   rv   r  rv   rv   rw   log2  r  zTritonOverrides.log2c                 C  r  )Nzlibdevice.nextafter(r   r   rv   r7  rv   rv   rw   	nextafter  r9  zTritonOverrides.nextafterc                 C     |  d| S Nr   rv   r  rv   rv   rw   logical_and     zTritonOverrides.logical_andc                 C  s
   |  dS )Nz == 0rv   r  rv   rv   rw   logical_not     
zTritonOverrides.logical_notc                 C  rC  Nz | rv   r  rv   rv   rw   
logical_or  rF  zTritonOverrides.logical_orc                 C  r  )Nr   ^ r   rv   r  rv   rv   rw   logical_xor  r  zTritonOverrides.logical_xorc                 C  rC  rD  rv   r  rv   rv   rw   bitwise_and  rF  zTritonOverrides.bitwise_andc                 C  s
   d|  S )N~rv   rG  rv   rv   rw   bitwise_not  rI  zTritonOverrides.bitwise_notc                 C  rC  rJ  rv   r  rv   rv   rw   
bitwise_or  rF  zTritonOverrides.bitwise_orc                 C  rC  )NrL  rv   r  rv   rv   rw   bitwise_xor  rF  zTritonOverrides.bitwise_xorc                 C  rC  )Nz << rv   r  rv   rv   rw   bitwise_left_shift  rF  z"TritonOverrides.bitwise_left_shiftc                 C  rC  )Nz >> rv   r  rv   rv   rw   bitwise_right_shift  rF  z#TritonOverrides.bitwise_right_shiftc                 C     d| d}d|  d| dS )Nr  ).to(tl.uint32)ztl.rand(r   r   rv   seedr   rv   rv   rw   rand     zTritonOverrides.randc                 C  rU  )Nr  rV  z	tl.randn(r   r   rv   rW  rv   rv   rw   randn  rZ  zTritonOverrides.randnc              	   C  s*   d| d}d|  d| d| d| d	S )Nr  rV  ztriton_helpers.randint64(r   r   rv   )rX  r   lowhighrv   rv   rw   	randint64  s   zTritonOverrides.randint64c                 C  r(  )Nz.ops.load_seed not implemented outside a kernelr)  )r'  r   rv   rv   rw   	load_seed  r   zTritonOverrides.load_seedc                 C  r  )Nzlibdevice.rsqrt(r   rv   r  rv   rv   rw   rsqrt  r  zTritonOverrides.rsqrtc                 C  r  )Nzlibdevice.log1p(r   rv   r  rv   rv   rw   log1p  r  zTritonOverrides.log1pc                 C  r  )Nzlibdevice.tan(r   rv   r  rv   rv   rw   tan  r  zTritonOverrides.tanc                 C  r  )Nzlibdevice.tanh(r   rv   r  rv   rv   rw   tanh  r  zTritonOverrides.tanhc                 C  r  )Nztl.sigmoid(r   rv   r  rv   rv   rw   sigmoid  r  zTritonOverrides.sigmoidc                 C  s   d|  d|  d|  dS )Nz(libdevice.signbit(z) != 0) if (z).dtype is tl.float32 else z < 0rv   r  rv   rv   rw   signbit  s   zTritonOverrides.signbitc                 C  r  )Nzlibdevice.fmod(r   r   rv   r  rv   rv   rw   fmod  r9  zTritonOverrides.fmodc                 C  r  )Nr~  r   r   rv   r  rv   rv   rw   pow  r9  zTritonOverrides.powc                 C  r  )Nztl_math.log(r   rv   r  rv   rv   rw   log  r  zTritonOverrides.logc                 C  r  )Nzlibdevice.log(r   rv   r  rv   rv   rw   libdevice_log  r  zTritonOverrides.libdevice_logF)rm   c                 C  r  )Nzlibdevice.isinf().to(tl.int1)rv   r  rv   rv   rw   isinf  r  zTritonOverrides.isinfc                 C  r  )Nzlibdevice.isnan(rj  rv   r  rv   rv   rw   isnan  r  zTritonOverrides.isnanc                 C  r  )Nzlibdevice.nearbyint(r   rv   r  rv   rv   rw   round  r  zTritonOverrides.roundc                 C  r  )Nrt  r   rv   r  rv   rv   rw   floor	  r  zTritonOverrides.floorc                 C  sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
Nrn  rg  z
tl.where((z
 < 0) != (z < 0), tl.where(z != 0, z - 1, ), r   rv   )r  r  rk  remrv   rv   rw   floordiv  s   *zTritonOverrides.floordivc                 C  sV   t dtj}t t || tj}t t | |tj}t ||}| d|  dS )Nr   r  .dtype))r  r  rs   r  r  ltr  sub)r  zleftrightrt  rv   rv   rw   sign  s
   zTritonOverrides.signc                 C  r  )NrV  r   rv   r  rv   rv   rw   trunc  r  zTritonOverrides.truncc                 C  rC  )Nrn  rv   r  rv   rv   rw   truncdiv$  s   zTritonOverrides.truncdivc                 C  r  )Nrx  r   rv   r  rv   rv   rw   ceil*  r  zTritonOverrides.ceil)NT)r  r  r  r  )r  r  r  r  )Xrr   ry   rz   r{   mathrA  er  rD  r  r  r  r}   r  r  r  r   r  r  r	  r
  r  r  r  r  r  r  r  r  rs   rt   r#  r$  r%  r&  r'  r+  r.  r/  r0  r1  r2  r3  r4  r5  r6  r8  r:  r;  r<  r=  r>  r?  r@  rB  rE  rH  rK  rM  rN  rP  rQ  rR  rS  rT  rY  r[  r^  r_  r`  ra  rb  rc  rd  re  rf  rg  rh  ri  rk  rl  rm  rn  rq  rx  ry  rz  r{  rv   rv   rv   rw   r  :  s   8


	





	



















r  r   c                   @  sL   e Zd ZdZedd Zedd Zedd Zedd	 Z	ed
d Z
dS )TritonKernelOverridesa   Map element-wise ops to Triton within a TritonKernel

    Unlike TritonOverrides, these assume the code is going to be inserted into
    the body of the main triton kernel and so it may use indexing and mask
    variables which are assumed to already be defined in the current scope.
    c                 C  s$   t j }dg| }| j|||dS )NrD   r  )rB   r   triton_tensor_ndimr  )ru   r   r  ndimr   rv   rv   rw   r  ;  s   

zTritonKernelOverrides.constantc                 C  s6  t jj|dd}t|tsJ t jjdkrtjntj}|tjtjfvr%|n|}t	j
j}zdt	j
_t jjjt jj|jt||d}W |t	j
_n|t	j
_w |tjtjfvrft jjjt jj| ||t|d}n/|}|jD ]}t|tjrt|t jjj|j j}qk||krt jjjt jj| |||d}|j|_|S )NF	block_ptrtl.int32r  r  r  )rB   r   indexingrP  r   rZ  rs   r  int64r   Ztest_configsZruntime_triton_dtype_assertcsegeneratecomputer   r3   r  r>   free_symbolsr   r   r   promote_typesvarname_mapr'  r  r   )ru   r   r  r  rZ  origr  	index_varrv   rv   rw   r+  D  sF   



z TritonKernelOverrides.index_exprc              	   C  s<  | d urt jjd urtjjjtjj|  dt jd} |j	j
dd}|s'J dd}|D ]}|jD ]}|jdks>t|jd rBd	} nq0q+|rHd n|}tjj| |d
}| }	W d    n1 s`w   Y  |r|	jjrot|}tjjjtjjd|	 dt| d|	 dt||	jd}t||	|}
n|	}
|
j| |
S )N.to(tl.int1)r  output)opz)graph for body does not contain an outputFloadrD   Tr   r  z.shape, r   rr  r  )rs   r_  r`  rB   r   r  r  r  rn   r   Z
find_nodesr/  targetr^   Z
mask_loadsr  Zis_boolrU   r   wrapr  r  r  r   discard)r   r,  r-  nodesZ
need_wherenoder  r   Znew_maskr"  ra  rv   rv   rw   r.  w  sB   
zTritonKernelOverrides.maskedc                 C  s,   t jj| }d| dt jjd| dS )Ntl.load( + Zload_seed_offsetr   )rB   r   r/  inputseed_offset)r'  r   r  rv   rv   rw   r_    s   zTritonKernelOverrides.load_seedc                 C  s   d|  d}t jj| }r|S t jjj| jd}t jjjtjd}t jj	| d| d|  d t jj
|||f ||fS )Nzfrexp(r   r  r   z = triton_helpers.frexp()rB   r   r  Ztry_getnewvarr  rs   r  r  r   put)r  	cache_keyZcse_valZmantissaexponentrv   rv   rw   frexp  s   zTritonKernelOverrides.frexpN)rr   ry   rz   r{   r}   r  r+  rD  r.  r_  r  rv   rv   rv   rw   r~  3  s    

2
,
r~  c                   @  sL   e Zd ZU dZded< ded< ddd	Zd
ddddZdd Zdd ZdS )HelperFunctionsz#An ordered set of helper functions.zdict[str, str]_templates_seen	list[str]finalized_helpersro   rp   c                 C  s   i | _ g | _d S rq   )r  r  r   rv   rv   rw   r    s   
zHelperFunctions.__init___triton_helper_fn	base_nametemplate_coder~   c                C  sL   | j |}|dur|S | t| j }|| j |< | j|j|d |S )a9  This accepts a function definition with the function name
        left as a format specifier e.g.

            @triton.jit
            def {name}(arg0, arg1):
                return arg0 + arg1

        We add the templated code to the function set and return the name
        assigned to that function.

        N)r'  )r  getr   r  rQ  r0  )r   r  r  Zexisting_namer'  rv   rv   rw   add  s   
zHelperFunctions.addc                 C  r   rq   )iterr  r   rv   rv   rw   __iter__  r   zHelperFunctions.__iter__c                 C  
   | j | S rq   )r  )r   r6  rv   rv   rw   __getitem__  r   zHelperFunctions.__getitem__Nro   rp   )r  r~   ro   r~   )	rr   ry   rz   r{   r|   r  r  r  r  rv   rv   rv   rw   r    s   
 
r  c                   @  sl   e Zd ZU dZejedZded< ejedZ	ded< ejedZ
ded< ejedZded< dd
dZdS )r   zM
    Class representing ND block dimensions, for block pointer analysis.
    )default_factoryr   r   r   r   r   r-  ro   c                   s@   t | }tdd | |fD \ |di  fdd D S )z0
        Concatenates block parameters.
        c                 s  s    | ]}t |V  qd S rq   )r  r  r  rv   rv   rw   r         z*BlockParameters.__add__.<locals>.<genexpr>c                   s   i | ]}| | |  qS rv   rv   )r   r  r  rv   rw   r         z+BlockParameters.__add__.<locals>.<dictcomp>Nrv   )r  r  )r   r-  ru   rv   r  rw   __add__  s   zBlockParameters.__add__N)r-  r   ro   r   )rr   ry   rz   r{   r  fieldr  r   r|   r   r   r   r  rv   rv   rv   rw   r     s   
 r   c                   @  s2   e Zd ZdZdd ZdddZdd	 Zd
d ZdS )"CooperativeReductionWorkspaceCachez
    The scratch space used for cooperative reductions can be reused
    after two reduction loops.  This keeps track of what can be reused.
    c                 C  s0   || _ g | _g | _ttj| _d| _d| _d S r(  )	r/  current_loop
prior_loopcollectionsdefaultdictdequeready_for_reuse
loop_countstore_count)r   r/  rv   rv   rw   r    s   
z+CooperativeReductionWorkspaceCache.__init__nbytesr   c                 C  sD   | j |}|r| S | j|d\}}| j|||f ||fS r>  )r  r  popleftr/  Z	workspacer  rQ  )r   r  cachedws_name	ws_offsetrv   rv   rw   allocate  s   z+CooperativeReductionWorkspaceCache.allocatec                 C  sF   | j D ]\}}}| j| ||f q| j| _ g | _|  jd7  _d S NrD   )r  r  rQ  r  r  )r   r  r  r  rv   rv   rw   on_loop_end  s
   z.CooperativeReductionWorkspaceCache.on_loop_endc                 C  s   | j }|  j d7  _ |S r  )r  )r   Zpriorrv   rv   rw   increment_store_count  s   z8CooperativeReductionWorkspaceCache.increment_store_countN)r  r   )rr   ry   rz   r{   r  r  r  r  rv   rv   rv   rw   r    s    
r  c                   @  s&   e Zd ZU ded< dd Zdd ZdS )FixedTritonConfigzdict[str, int]r   c                 C  r  rq   r   r   r  rv   rv   rw   r    r   zFixedTritonConfig.__getitem__c                 C  s
   || j v S rq   r  r  rv   rv   rw   __contains__   r   zFixedTritonConfig.__contains__N)rr   ry   rz   r|   r  r  rv   rv   rv   rw   r    s   
 r  c                   @  s   e Zd ZdZd	ddZdS )
	TritonCSEz
    Subclasses CSE to apply the current load mask to the cache key to avoid CSEing
    variables across separate masked blocks.
    r  r~   ro   Union[str, tuple[str, str]]c                 C  s   t jj }r||jfS |S rq   )rB   r   
_load_maskr'  )r   r  r   rv   rv   rw   augment_key*  s   
zTritonCSE.augment_keyN)r  r~   ro   r  )rr   ry   rz   r{   r  rv   rv   rv   rw   r  $  s    r  c                      s  e Zd ZU eZded< eZded< dZ			ddχ fddZ	dddZ
dddZdd Zdd Zdd Zdd Zdd d!Zd"d# Zedd$d%Zdd&dd&d'dd*d+Z	,ddd2d3Zdd4d5Zdd:d;Zd<d= Zdd>d?Z	dddDdEZdFdG Z		dddRdSZddTdUZddVdWZdd\d]Zdd^d_Z dd`daZ!dbdc Z"ddde Z#dfdg Z$dhdi Z%djdk Z&ddldmZ'ddpdqZ(ddudvZ)ddydzZ*d{d| Z+dd~dZ,dd Z-dd Z.dd Z/e0dd Z1dddZ2e0dd Z3e0dd Z4dd Z5dddZ6dd Z7ddddZ8dddZ9dddZ:dddZ;dddZ<dddZ=dddZ>dddZ?dddZ@dddZAdddZBdddZCeDdddZEdddZFdddZGeDdddńZHdddȄZIdddʄZJddd̈́ZK  ZLS )TritonKernelr  helper_functionszCallable[[sympy.Expr], str]kexprTr   Ntilingdict[str, sympy.Expr]fixed_configOptional[FixedTritonConfig]ro   rp   c                   s   || _ || _t j|fi | t| j| j| _t | _	t | _
tt  | _|| _t | _tttf  | _t | _tt| _t | _tt  | _d | _| jrW|  | j! | j"r^| #  | $  | j"rk| %  d S d S rq   )&optimize_maskr  r  r  r  Znewvar_prefixsuffixr  rL   post_loop_combinepost_loop_storer   r   outside_loop_varsr  r  countblock_ptr_iddictr~   block_ptr_to_bufferr  r  r  r  pointer_advancementsCounter_load_countsr'   autotune_hintstriton_metar  codegen_reduction_numelsr,  cooperative_reductioninit_cooperative_reductioncodegen_range_treeinit_cooperative_reduction_mask)r   r  r  r  r  r  r  rv   rw   r  7  s0   

zTritonKernel.__init__r  r  r~   c                 C  s   t |S rq   )r<   )r   r  rv   rv   rw   dtype_to_str^  r@  zTritonKernel.dtype_to_strrn   c                 C  s   | j o	tj| jS rq   )r  rB   choices should_use_cooperative_reductionr  r   rv   rv   rw   r  a  s   z-TritonKernel.should_use_cooperative_reductionc                   s    j sJ  jD ]}|jdur| jd7  _q jd } jr't| jd } j| _t	 j _
 jd t fdd jD rN jd dS dS )	z/One time setup code for cooperative reductions.NrD   r  r   a              RSPLIT_NEXT_POWER_OF_2: tl.constexpr = triton_helpers.constexpr_next_power_of_2(RSPLIT)
            RSPLIT_IS_POWER_OF_2: tl.constexpr = RSPLIT == RSPLIT_NEXT_POWER_OF_2
            HAS_RSPLIT: tl.constexpr = RSPLIT > 1
            rsplit_id = tl.program_id(0)
            num_rblocks = (rnumel + RBLOCK - 1) // RBLOCK
            rsplit_chunk = (num_rblocks + RSPLIT - 1) // RSPLIT * RBLOCK
            rsplit_start = rsplit_chunk * rsplit_id
            rsplit_end = rsplit_chunk * (rsplit_id + 1)
            c                 3  s"    | ]}|j r | V  qd S rq   )r  _has_constant_maskr  r   rv   rw   r     s    

z:TritonKernel.init_cooperative_reduction.<locals>.<genexpr>z>rsplit_end = tl.where(rsplit_end < rnumel, rsplit_end, rnumel))r  r   grid_dimr  r  r   r/  Z
semaphoressemaphores_namer  %cooperative_reduction_workspace_cacher,  r   r   r   )r   r   Z	sem_countrv   r   rw   r  f  s,   



z'TritonKernel.init_cooperative_reductionc                 C  sX   d}| j s
| d}| jd|  |  r| jd d S | j r$J | jd d S )Nz$tl.arange(0, RSPLIT_NEXT_POWER_OF_2)z	[None, :]zrsplit_arange = z                if RSPLIT_IS_POWER_OF_2:
                    rsplit_mask: tl.constexpr = None
                else:
                    rsplit_mask = rsplit_arange < RSPLIT
                zSrsplit_mask = xmask if RSPLIT_IS_POWER_OF_2 else ((rsplit_arange < RSPLIT) & xmask))r  r,  r   _has_constant_xmaskr   )r   Zrsplit_arangerv   rv   rw   r    s   

	z,TritonKernel.init_cooperative_reduction_maskc                 C  s   | j D ]}|js| || j q| jr"| j|j d| |  q| jrStdd | j D rK| j	dddd}| 
|}| jd| |  d S | | j d S d S )Nzbase = c                 s  s    | ]}|j V  qd S rq   is_loopr  rv   rv   rw   r     s    z2TritonKernel.codegen_range_tree.<locals>.<genexpr>baseTr   zrbase = )r   r  iteration_ranges_codegen_headerr,  r  r   r  iteration_ranges_ranges_coder   _get_reduction_symbols_flatten_reduction_indicesr   r   codegen_reduction_indices)r   r   Zrn_basesZrbaserv   rv   rw   r    s"   

zTritonKernel.codegen_range_treec                 C  r=  )z
        Indicate whether we need provide numel as arguments for the generated
        kernel calls in the benchmark.

        Should be true for pointwise/reduction kernels but false for triton
        matmul kernels.
        Trv   r   rv   rv   rw   need_numel_args  s   zTritonKernel.need_numel_argsc                 C  s   | j otj| j| jS rq   )r  rB   r  should_use_persistent_reductionr  r  r   rv   rv   rw   r    s   z,TritonKernel.should_use_persistent_reductionc                 C  s@   | j rt| j| jd kr| jr| jd dkS tj| jS dS )NrD   r   F)	persistent_reductionr   r  r  r  rB   r  want_no_x_dimr  r   rv   rv   rw   r    s   zTritonKernel.want_no_x_dimc                 C  r=  )Nztl.device_assertrv   r   rv   rv   rw   assert_function  s   zTritonKernel.assert_functionF)
copy_shapedense_indexingoverride_maskr  r   r   c             	     s       j}d}tt  |D ]]ttjsJ |p"ttj	}|r&qtt
jr:jjj }|j qtt
jt
jt
jt
jt
jt
jfrLqfddtjD }	t|	dksdJ dj |	d  d qtjjpy|pyjduo} dk}
d	}d}tt  } D ]}||j rd	}nd}||j! d q|rj"rtjj#r|sjst| dkr$ s|rj%d
krd#ddd#fddd$fddd% fdd}| }|dur|S d}& }t tj'r2|r| dn( }d| d| d}j)r* stdgnt jr*j t+||| S |
rP|sP|r@| dn( }d | d| d!}|n|sa|rad | d| d"}||rit|gjrsj , t+||| S )&zO
        Compute the index and mask to pass to tl.load() or tl.store()
        Fc                   s   g | ]}t  |rt| qS rv   )r   r   r   r  rv   rw   r         z)TritonKernel.indexing.<locals>.<listcomp>rD   zAmbiguous type: r   r   NTr  r   r   
range_treerX   ro   Optional[BlockParameters]c                 S  sB   t | | }|du rdS t|jgt|g|gt|gdS )z
                Matches expressions of the form:
                    idx = s * xindex

                This implies stride (s,), and shape (XBLOCK,).
                Nr   r   r   r   )rE   Zmatch_affine_block_exprsymbolr   numelr   r   r   )r   r  r   rv   rv   rw   match_affine_block%  s   	

z1TritonKernel.indexing.<locals>.match_affine_blockc              
     s      tjdtjtj gdd\}}tdtj| 	t
 || 	t || }t|  j|}|du r;dS |\}}}t|}	tjjjtfdd|	D r]dS tt|	d gfd	d
t|	dd |dd D  }
 fdd
|D }t||
||dS )a  
                Matches higher-dimensional blocks coming from FloorDiv and ModularIndexing.

                Example expression to match:
                   sN * ((rindex//(d1 * ... * d(N-1))))
                       + s1 * ModularIndexing(rindex, 1, d1)
                       + ...
                       + s(N-1) * ModularIndexing(rindex, d1 * ... * d(N-2), d(N-1))

                This iterates over a block of shape (dN, ..., d1) and stride
                (sN, ..., s1). (d1,...,d(N-1)) and (s1,...,sN) are
                wildcards that we match.

                Note that dN does not appear in the expression, but we solve for it
                using range tree numels and the other dims.
                zdenom modulo)exclude)ru   r   Nc                 3  s*    | ]} |  o| V  qd S rq   )r4  Zstatically_known_power_of_2)r   r  )	max_blockr   rv   rw   r   z  s    

zETritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<genexpr>r   c                   s"   g | ]\}}t t ||qS rv   )r   ZMinr   )r   r  r   )linear_block_sizerv   rw   r     s    zFTritonKernel.indexing.<locals>.match_mod_div_block.<locals>.<listcomp>rD   c                   s    g | ]}t | tiqS rv   )r;   r   r   r   )r  r  rv   rw   r     s    r  )r  r   symbols	functoolspartialZWildr  r   range_tree_nodesr  r   r   rE   Zmatch_mod_div_block_exprr  Zget_slice_numelsrB   r   r   r
  r  r   r   r   r   r   r   )r   r  denomZmoduloZnum_dimsZmatch_resultr  r   Zblock_index_exprsZslice_numelsr   r   r   )r  r  r
  r  r   rw   match_mod_div_block;  sZ   
	



z2TritonKernel.indexing.<locals>.match_mod_div_blockr   c                   s,    fD ]}|| |}|dur|  S qdS )ze
                Match a block indexing subexpression involving a single range tree.
                Nrv   )r   r  
match_funcmatch)r  r  rv   rw   match_block_pointer_subexpr  s   
z:TritonKernel.indexing.<locals>.match_block_pointer_subexprOptional[BlockPtrOptions]c                    s   t dd j D  jdd}  fdd| D }tdd | D }t }t| |D ]!\}}t||j	d	kr> d S ||}|d u rJ d S ||7 }q- t
| } tj||| jd
S )Nc                 S  s   i | ]\}}||j qS rv   r   )r   vtrv   rv   rw   r     rJ  zFTritonKernel.indexing.<locals>.match_block_pointer.<locals>.<dictcomp>T)Zreorderc                   s   g | ]
}t  | qS rv   )rE   Zget_subexpr_involving_symbolr  r  Zindex_relative_to_xyr_indexrv   rw   r     s    zFTritonKernel.indexing.<locals>.match_block_pointer.<locals>.<listcomp>c                 s      | ]}|  V  qd S rq   )r  r  rv   rv   rw   r         zETritonKernel.indexing.<locals>.match_block_pointer.<locals>.<genexpr>rD   )r   r   r   r   r   )r;   r  r  active_range_treesr   r   r   r   intersectionr  sumfilter_masksr   r#  r
  )r   Zindex_subexprsZrange_symbolsZblock_paramsr   Zsubexprr   r   )r   r   r  r   r  rw   match_block_pointer  s2   



z2TritonKernel.indexing.<locals>.match_block_pointerz.shaper  r   z, tl.int32)Zxmaskr   r   .shape))r   r   r  rX   ro   r  )r   r   r  rX   ro   r  )ro   r  )-Zprepare_indexingr  r   r~   rP  r   r   r   r   r   r   r   r  r  r'  r  r   ZUNBACKED_INTZSIZEZPRECOMPUTED_SIZEZINDEXFLOATZUNBACKED_FLOATr   r   r  r   r   r   r  r  r  Zvar_listr  allow_block_ptrZuse_block_ptris_indirect_indexingrZ  r   r)  dense_size_strr  r  r   r  )r   r   r  r   r  r  Z
index_varsr   Zcse_varZprefix_matchesZ
need_denseZ
have_denseZhave_loop_varsZdense_mask_varsr   r   optionsr   r   rv   )r   r   r  r  r  r   r  rw   r    s   





b-


zTritonKernel.indexingr   r'  r  r  r   tuple[str, str]c           
   
   C  s   |  }|s	d}n|r|dksJ d|d}nd|}| jrt| jd jrt| rtdt| j }| jt	|| d|j
|dd	  || j|< tjD ]"}||}td
d |D r^qM| j| }	||	vskJ d||	|< qM||fS |
|}||fS )Nr   , other=0.0, boundary_check=z, padding_option='zero'r  r   = F)r%  c                 s  s&    | ]}t jj|td V  qdS r   N)rB   r   r   r   r   r)  r,  rv   rv   rw   r     s
    
z1TritonKernel.codegen_block_ptr.<locals>.<genexpr>z@duplicate advancement for pointer '{block_ptr}' at type '{symt}')r8  r  r   r  r   nextr  r,  r   rK   r0  r  r   r   r<  r  r  )
r   r'  r  r  r-  checkr  r   Zadvance_offsetsZadvancementsrv   rv   rw   codegen_block_ptr  sD   








zTritonKernel.codegen_block_ptrc                 C  sF   | ||j|jd}| dttj| d}d| d| | dS )NFr  r   	tl.store(r   )r   r   r   r  rB   r   	get_dtype)r   r'  r  r  r   r-  rv   rv   rw   codegen_block_ptr_store_line,  s
   z)TritonKernel.codegen_block_ptr_store_liner   rS  lowerr   c                 C  s   |s|sd S t |tjsJ | j|dd}t |tsJ |j}| r&|jnd }|r1t| 	|nd }| 
||r:dnd ||}	| |}
| jj|
|	dtjd d S )NFr  0)Z
assignmentr  )rP  r   Exprr  r   r   r   r   texprrename_indexingZindirect_assertget_load_bufferr  r  rs   r  )r   r   rS  r2  r   r  r   r   Zsize_strlinebufferrv   rv   rw   check_bounds6  s   
zTritonKernel.check_boundsc                 C  s<   |  s| r| jS | jr| jd jr| s| jS | jS )Nr  )	r   r   r  r  r   r  r   r,  loads)r   r  rv   rv   rw   r7  P  s   
zTritonKernel.get_load_bufferc              
     s  | j }| j  d7  < t}| ||}| j|dd}| | }tdd | 	|
 D }| |r?d}	n(|sDd}	n#| jre| jd jre fdd	}
  d
}	ttd|
}nd}	|skr~| r~| jr{dt| j }nd}nd}	 d}tjjr| j }| dk}	 | | o| j o| o|}d}|rd}d }tj}tr|}|tjtj fv rtj!}nvt"|t#r| $|||\}}d| | |	 | d}|%||j&|j'd}n't"|t(j)rd| d| d}|j*}nd| d|j+ d|j, |	 | | d
}|tjtj fv r$tjj-r$|d7 }tj!}|tj.kr8tj/j0d u r8|d7 }tj.}| 1|}| j2j3||||d}|j4dkrV  d8  < t"|t5s^J |j6|_6|rd| d| d}| j2j3|||d}|j6r|j7rd}n|tj.krd}nd}| jrt| jn|}d|j, d| d| d}| j2j3|||d}| jr|8 ss| j9:| |S )NrD   Tr  c                 s  s    | ]}|d kV  qdS r   rv   r  rv   rv   rw   r   t  r  z$TritonKernel.load.<locals>.<genexpr>z, eviction_policy='evict_last'r  c                     s     krs
rdS dS )NZ
evict_lastZevict_firstrv   rv   Zexpected_countr   Zindirect_indexingZload_countsr'  rv   rw   decide_later}  s   z'TritonKernel.load.<locals>.decide_laterz, eviction_policy='<EP>'z<EP>r   z, other=r(  z, cache_modifier='.cg'r  r   r-  r  ro  r  r  r  r   r   z0.0r  r3  r  );r/  r  r  r   r$  r  r   r   r   Zget_strides_of_loadr  is_broadcastedr  r   r  r  r  r2   r   Z_load_otherrU   r   r   skip_l1_cacher  buffer_read_countsrB   r   r0  r^   rs   r  r  rt   rP  r   r.  r   r   r   r   r)  r   r   r   r  rn   r_  r`  r7  r  r  Z	use_countr  r   r  r   r  r  )r   r'  r   r  Z	make_lineoriginal_indexr  r   Zis_coalescedepr=  r-  Zhas_read_depsr@  r?  ZcachemodZappend_broadcastr  r8  r  Zload_buffer
result_varzeroZ	other_valrv   r<  rw   r  _  s   



(
zTritonKernel.loadr   rJ   moderA   c              	   C  sB  | j |}|}| j|d|d u d}|| j jv }| |}	|r*|	r*| jt|d t|t	rB| 
|||\}
}| |||
||}n3|d u rXd| d|j d| d|j d	}n|d	krnd
| d|j d| d|j d	}ntd| t }| js| jr|| || j | jt|| | js| j| |  d S )NT)r   r  ztl.debug_barrier()r/  r-  ro  r   r   Z
atomic_addztl.atomic_add(z, sem='relaxed')zstore mode=)r/  r  r  inplace_buffersr>  storesr   rK   rP  r   r.  r1  r   r   r*  
contextlib	ExitStackr  r  enter_contextguard_cooperative_storer  r  close)r   r'  r   r   rE  r  rA  r  Z
is_inplacer>  r  r-  r8  
exit_stackrv   rv   rw   store  s0   


$$zTritonKernel.storec                 C  s*   | j  }|t|d| d | S )z
        For cooperative reductions only one thread block should write out the result.
        We rotate which thread block does each write for better parallelism
        zif rsplit_id == (z % RSPLIT):)r  r  r   rK   indent)r   r'  r9  r6  rv   rv   rw   rK  	  s   
z$TritonKernel.guard_cooperative_storer  
boundaries.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]boundary_indicesindexing_dtyperw  sorter Optional[tuple[str, sympy.Expr]]sorter_indicesOptional[CSEVariable]c                 C  s   | j tj | j|d }| |d }	| |d }
| |d }|r.| j|d nd}|r9| |d nd}|tjkrCd}n|tj	krKd}nt
d| jj| jd	| d
| d
|	 d
|
 d
| d
| d
| d
| d
| d
| d
| d|d}|S )z3
        See [Note: Inductor bucketize op]
        r   rD   r   r   rp   r  ztl.int64z5Bucketize only supports indexing with int32 and int64z'triton_helpers.bucketize_binary_search(r   z, )r  )r  r  r'   ZONE_ELEMENT_PER_THREADr/  r  r   rs   r  r  r*  r  r  r  )r   r  rP  rR  rS  rw  rT  rV  Zboundaries_ptrZboundary_sizeZboundaries_underlying_numelZboundary_strideZ
sorter_ptrZsorter_strider  r"  rv   rv   rw   	bucketize	  sP   

zTritonKernel.bucketizec                 C  sP   |   }|dkrd| dS | j}dg||  dg|  }| dd| dS )	NrD   z!triton_helpers.promote_to_tensor(r   rM  rp   rN  r   rO  )r  r  r   )r   r   ZndimsZnreducesizesrv   rv   rw   reduction_resizeL	  s   zTritonKernel.reduction_resizec                 C  sT   | j dkr|S |  | j  }|  }|d| dg }t| jj|t||||dS )zC
        Reshape to RBLOCK, collapsing all reduction dims.
        rD   NZRBLOCKr  )r  r  dense_size_listr~   r  r  r   )r   r9  r   r  Ztarget_ndimr   Ztarget_shaperv   rv   rw   reduction_collapse_dimsU	  s   
z$TritonKernel.reduction_collapse_dimsr  reduction_typer@   +Union[CSEVariable, tuple[CSEVariable, ...]]c           .        s	  dKdd}dd t |D }t ||}tdd	 |D r,t|tj}ttjjs1J td
d	 j	D }
| t|}jrM|j j	d jd } fdd|} j dLfdddMfdd}	fdd}
||f}|jjv rjj| S t|}t|}jj|d}tdd	 |D |_d|fdd	jrtj|}t|}dN	fdd  d!krnt|t r fd"dt!||D }n ||}d#v r#t"jj#j$d$| d%| d&t%j&j'd'krtj(ntj)d}d(d)d# |
j$||| nd*kr@j*r8+||	|}n̈,|}nĈd+krit|t-sMJ |\}}}t fd,d	.j$|||D }nd!krv/|}nt|t0s~J jj#j$j$t"|d |j1d}nrjj2d-| |d}tj3|}t|}t|t sĈj45| d.  d/| d/| d0 d#v r3d-| d1}j67 }j45| d.  d/t8|j9 d/:| d0 d(d)d# j$;d2| d3| d4 d5| d/| d/| d/| d6| d7	| d8| d9| d7	| d8| d9 |
j<||| nt=rC+||	|}nd!krd-| d:}d-| d;}j45| d.  d<| d0 j45| d=  d/| d0 j$;d>| d3| d?| d/| d/| d/t>j? d@ j$;d>| d7	| d8| d>| d7	| d8| d>	 |}jjd}@j<||||}n?tA|}|||}j$5| d7	||  |tjBkr| dA}tC}|	j<t"||| n|	j<t"|t"|d  j*rtj3|}tDE }j<jFfD ]} | 5dB |G| H  qd#v rbj<5| dCI| dD  J| dE||}!j67 }J||t8|j9}"|
jF||!|" nt=rd*ksnJ |\}#}$}%J|#t||d }&J|$t||dF }'J|%t||dG }(KjF|#|$|%|&|'|(	 n?d!kr|\}}J|t||d })J|t||dF }*@jF|||)|* nJ|t||}+|	jFt"||+d  |L  |jj|< t|t rNtMdHd	 |D sJ jNO| dIv rtP|dFksJ tP|| }tP|tP|ks#J t!||D ]#\},}-|-d us3J |,j1|-krJj<5|, d7|, dJtC|- d0 q(|S t|tQsVJ jNR| |j1|d kr|d d usmJ j<5| d7| dJtC|d  d0 |S )ONr   rJ   ro   c                 S  s$   | j tjtjfv rt| tjS | S rq   )r  rs   r  r  r  r  rt   r  rv   rv   rw   maybe_upcastm	  s   z,TritonKernel.reduction.<locals>.maybe_upcastc                 S  s   g | ]}|j qS rv   r  )r   r  rv   rv   rw   r   {	      z*TritonKernel.reduction.<locals>.<listcomp>c                 s  s     | ]}|t jt jfv V  qd S rq   )rs   r  r  r  rv   rv   rw   r   }	  s    z)TritonKernel.reduction.<locals>.<genexpr>c                 s      | ]	}|j  d V  qdS r   Nr  r  rv   rv   rw   r   	  r  r  r   c                   s$   j jjd|  d  d| jdS )Nr   r   r   r  r  r  r  r  )r  )r%  r   rv   rw   <lambda>	  s
    z(TritonKernel.reduction.<locals>.<lambda>r~   result_typer   c              
     s   dv }|rdnd} | |}dv r'| d d| d  d}n| d d	| d  d}|d
urD| d| d}|S )zK
            Helper to generate a reduction call, e.g. tl.sum.
            )r   r  minprodZtriton_helperstl)r  rg  r  z2(r   r   r  Nr  )r\  rZ  )r9  r   rf  Z
use_helpermodule)r   r  r]  r   rv   rw   final_reduction	  s   z/TritonKernel.reduction.<locals>.final_reductionrC  rp   c                   s$    | ||}|  | d|  dS )zU
            Generate a reduction and assign it to an existing variable.
            r*  N)r   )r9  rC  r   rf  )rk  rv   rw   final_reduction_define	  s   	z6TritonKernel.reduction.<locals>.final_reduction_definec                   sh    | |} | |}| d| d| d d| d| d  d| d| d d	 d S )
N                z_val, z_idx = triton_helpers.z_with_index(r   )
                r*  Z_idx
                )r\  r   rZ  )r9  rC  r   r   )r   r  root_opr   rv   rw   final_argreduce	  s*   z/TritonKernel.reduction.<locals>.final_argreducer  c                 s  s     | ]}t |d  s|V  qdS r+  )r8   r  rv   rv   rw   r   	  s    
r   c                   s    s| S t  | |S rq   )r~  r  )ZtvalZfval)condrv   rw   
where_cond	  s   z*TritonKernel.reduction.<locals>.where_condc                   s    j j j| || jdS )Nr  rd  )r   default)r   rs  rv   rw   _mask_value	  s   z+TritonKernel.reduction.<locals>._mask_valueonline_softmax_reducec                   s   g | ]	\}} ||qS rv   rv   )r   r  d)ru  rv   rw   r   	  r  )ZargmaxZargminr   zindex, r!  r  r  rg  welford_reducewelford_combinec                 3  s$    | ]}j jj| d V  qdS )r  N)r  r  r  r   r   r  r   rv   rw   r   
  s
    
r!   = tl.full(r   r   _indexrm  _next, z_next = triton_helpers.z%imum_with_index(
                    z(index
                )
                r*  _nextro  _maxZ_sumz, float('-inf'),  = tl.zeros(z
                    zG_next = triton_helpers.online_softmax_combine(
                        z+
                    )
                    z.to(tl.int8)zif HAS_RSPLIT:z_bval = Z_valZ_bvalrD   r   c                 s  s    | ]}t |tV  qd S rq   )rP  r  r  rv   rv   rw   r   
  r  )rx  rv  r  )r   rJ   ro   rJ   )r   r~   rf  r   ro   r~   )rC  r~   r   r~   rf  r   ro   rp   )ro   rJ   )SpytreeZtree_leavesZtree_mapr   rs   r  rt   r  r   r   r  sortedr  rQ  r  r%  Z_map_tuple_or_scalarr  r  r  Zreduction_cacher  r  r  r   r   r  r   Z	Reductiondefault_valuerU   rP  r  r   r~   r  r  rB   r   rZ  r  r  r  rx  Zwelford_reduce_fallbackr   _welfordZ prepare_softmax_twopass_fallbackrJ   r  namedvarZdefault_accumulatorr,  r   r  Zselect_index_dtypeZiinfor  r  r   r  r6   r   r  %online_softmax_reduce_final_reductionZget_reduction_combine_fnrn   r  rH  rI  r  rJ  rO  rZ  *codegen_cooperative_reduction_peer_combinewelford_reduce_final_reductionrL  r  r  r  r   r  r  ).r   r  r  r]  r   r_  Zoriginal_dtypesmasksZreduction_range_prefixrl  rq  r  acc_typeZtorch_acc_typerC  rt  Zmasked_valueZaccumulator_indexmeanm2weightaccumulatorrZ  accumulator_maxaccumulator_sum
result_max
result_sum
combine_fnupdatedZaccumulator_casted_strrf  rM  bufZpeer_valZpeer_idxresult_mean	result_m2result_weightZ	peer_meanZpeer_m2Zpeer_weightpeer_maxpeer_sumZpeersr  Z
orig_dtyperv   )
ru  rr  r%  r   r  rk  r]  rp  r   rs  rw   	reductionf	  s0  


	






	








	


		











zTritonKernel.reductionc                   s    || } || } fddtdD \}}|d| d| d| d| d| dtj d| d|  d| d|  d ||fS )	Nc                      g | ]}t jj d qS r  r~   r  r  r   r{  rv   rw   r   
  r  z7TritonKernel._online_softmax_reduce.<locals>.<listcomp>r   
            r   z9 = triton_helpers.online_softmax_reduce(
                )
            r*  )r\  r   r   r   r  rZ  )r   r9  r  r  r   r  r  r  rv   r{  rw   _online_softmax_reduce
  s6   

	z#TritonKernel._online_softmax_reducec           
   	     s    fdd|||fD \}}}d| d| d| d| d	}fddt dD } d| d	|  tfd
d|D }	|	S )z;
        Helper to codegen triton_helpers.welford.
        c                 3  s    | ]
}  |V  qd S rq   )r\  rz  r9  r  r   rv   rw   r     s
    
z(TritonKernel._welford.<locals>.<genexpr>ztriton_helpers.welford(r   r   c                   r  r  r  r   r{  rv   rw   r     r  z)TritonKernel._welford.<locals>.<listcomp>r   r*  c                 3  s    | ]}  |V  qd S rq   )rZ  rz  r   rv   rw   r   	  r  )r   r   r   r  )
r   r9  r  r  r  r   r  ZwelfordZwelford_resultsZresult_valuesrv   r  rw   r  
  s   zTritonKernel._welfordc                 C  s  |   | j }| d}| d}	| d}
| j| d|   d| d | j|	 d|   d| d | j|
 d|   d| d |dkru|\}}}| jd| d	|	 d	|
 d
| d|	 d|
 d| d| d| d n"|dks{J | jd| d	|	 d	|
 d| d| d|	 d|
 d | jd| d|| d| d|	 d||	 d|	 d|
 d||
 d|
 d |}| jj|d}| jj|d}| 	| j
|||||	|
||	S )z%Helper to codegen a welford reductionZ_meanZ_m2Z_weightr  r   r   ry  rm  r~  z<_next = triton_helpers.welford_combine(
                    z,
                    z#
                )
                rx  z;_next = triton_helpers.welford_reduce(
                    z1, roffset == 0
                )
                z            r*  r  r  r  )r  r  r,  r   r%  r  r   r  r  r  r  )r   rC  r]  r   rs  r  r  r   r  Zaccumulator_m2Zaccumulator_weightr  r  r  r  r  r  rv   rv   rw   rx    s   



	zTritonKernel.welford_reducec
                 C  sP   |  ||||||	}
|||g}t||
D ]\}}|| d|  q|||fS )z0Helper to codegen call to triton_helpers.welfordr*  )r  r   r   )r   r9  r  r  r  r  r  r  r   r  r  result_exprsresult_exprr   rv   rv   rw   r  G  s
   

z+TritonKernel.welford_reduce_final_reductionc                 C  sJ   |  |||||}||g}	t|	|D ]\}
}||
 d|  q||fS Nr*  )r  r   r   )r   r9  r  r  r  r  r   r  r  r  r  r   rv   rv   rw   r  [  s
   z2TritonKernel.online_softmax_reduce_final_reductionc                 C  s   | j r| j d S tS )NRSPLIT)r  r*   r   rv   rv   rw   
max_rsplite  s   
zTritonKernel.max_rsplitc           	      C  s   | j d }|  sdnd}||j |   }| j|\}}| jjd| d| d| | dt	| d| d	| d
| ddd | j
| d| dt| d | dS )a	  
        Generate code to save a [XBLOCK, RSPLIT] temporary workspace, where each thread block writes a different
        column.  After the barrier, every thread block loads the completed value so that it can compute the final
        value independently.
        r  zxindex < xnumelNro  z_ws = (r  z).to(tl.pointer_type(z))
                tl.store(z%_ws + (xindex * RSPLIT + rsplit_id), r   r  Tstripz_peers = tl.load(z_ws + (xindex * RSPLIT + rsplit_arange), rsplit_mask, eviction_policy='evict_first', other=triton_helpers.if_mask(rsplit_mask, r  Z_peers)r  r  r  r  r  r  r  r   r   r<   r  r   rU   )	r   rC  r  Zdefault_valZxnumelr   r  r  r  rv   rv   rw   r  j  s8   

z7TritonKernel.codegen_cooperative_reduction_peer_combinec                 C  s   | j sJ d| _ | j|dd}d| _ | j|}t }| jr)|| || j	 t
|trG| j	t|| |||||d|  nt
|tsNJ | j	t|d| d|j d| d|j d		 |  d S )
NFTr  r)  r/  r-  ro  r   r   )r  r  r/  r  rH  rI  r  rJ  rK  r  rP  r   r   rK   r1  r0  r8  r   r   r   rL  )r   r'  r   r   r  r  rM  rv   rv   rw   store_reduction  s>   

 zTritonKernel.store_reductiondtypestuple[torch.dtype, ...]c           	   	     s*  t  d t   fddtdD }ddd tj|D }d| d	 t d
ddl	m
} | G  fdddt} 4 t|  || }ddd |D }d|  W d    n1 sww   Y  W d    n1 sw   Y  | jj dS )Nz@triton.jitc                   s*   g | ] t  fd dtD qS )c                 3  s.    | ]} j d  d| | dV  qdS )r  r!  r  N)r  r   n)r  r  r  rv   rw   r     s   , z7TritonKernel._lift_helper.<locals>.<listcomp>.<genexpr>)r  r   )r   )r  r  num_args)r  rw   r     s    z-TritonKernel._lift_helper.<locals>.<listcomp>r   r   c                 s      | ]}t |V  qd S rq   r  r  rv   rv   rw   r     r  z,TritonKernel._lift_helper.<locals>.<genexpr>zdef {name}():r  r   rc   c                      s"   e Zd Zd fd	d
ZdS )z+TritonKernel._lift_helper.<locals>.CSEProxyr'  r~   r/  tuple[Any, ...]r  dict[str, Any]ro   r   c                   sB   d| 7 t ||i |} jt ||i ||dS )Nr!  r  )r  r  )r   r'  r/  r  Zoutput_dtyper  dtype_handlerhelperhelper_name	overridesrv   rw   _default  s   z4TritonKernel._lift_helper.<locals>.CSEProxy._defaultN)r'  r~   r/  r  r  r  ro   r   )rr   ry   rz   r  rv   r  rv   rw   CSEProxy  s    r  c                 s  r  rq   r  )r   r  rv   rv   rw   r     r  return r  )rL   r   rI   r   r   r  r  from_iterabler  r  rd   r$   rO  rB   Zset_ops_handlerr  r  r   )	r   fnr  r  r/  	signaturerd   r  Zoutputsrv   )r  r  r  r  r  r  r  rw   _lift_helper  s*   
 zTritonKernel._lift_helperr  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]tuple[CSEVariable, ...]c                   st  j sJ jrJ dtdd jD }| t|}jr&J dg }g }tdd |D }t	j
jj |t||} j }t||D ]k\}	}
j
jj|	 dt|
 d|
d}j
jjd	| d
  d|
d}	||	 t|
}jsj
j|
d} }d|d< dd
| d}|
jrdnd}j| d| d
| d
| d || qQdd fdd}|d| d| d
| d|||}js! fdd|D }|t|t|}|t||} fddt||D }t|||D ]\}}}j| d| d
| d q
n|}|D ]}t|ts/J t||_q%t|S )NTODOc                 s  ra  rb  rc  r  rv   rv   rw   r     r  z$TritonKernel.scan.<locals>.<genexpr>z(ops.scan not supported inside ops.maskedc                 s  r  rq   r>   r   r  rv   rv   rw   r     r  r  r   r  r   r   rK  r  rN  rO  zfloat('nan')z-1r|  c                 S     d dd | D S )Nr  c                 s      | ]}| d V  qdS ,Nrv   rz  rv   rv   rw   r     r  z1TritonKernel.scan.<locals>.csv.<locals>.<genexpr>r   r  rv   rv   rw   csv  r   zTritonKernel.scan.<locals>.csvc           	        s   t |} fddt|D }tfdd|D r$fdd|D S fdd|D }j| d   t||D ]\}}rH|_j|| q?t	|S )Nc                       g | ]}  d | d  qS r   rv   r  r8  r  rv   rw   r          z;TritonKernel.scan.<locals>.cse_multiple.<locals>.<listcomp>c                 3      | ]	} j |V  qd S rq   r  containsr   r  r   rv   rw   r     r  z:TritonKernel.scan.<locals>.cse_multiple.<locals>.<genexpr>c                      g | ]} j |qS rv   r  r  r  r   rv   rw   r     rJ  c                   s   g | ]	} j j|d qS r  r  r  )r   Z_dtyper   rv   rw   r     r  r*  )
r   r   r  r  r   r   r   r  r  r  )	r8  r  r  r  r  
cache_keysresult_varsrC  r  r  r   r  rw   cse_multiple  s   z'TritonKernel.scan.<locals>.cse_multipleztl.associative_scan((ro  c                   s&   g | ]} d | dt |jdqS )ztriton_helpers.select_one((z1), rbase == (RBLOCK - 1), dim=-1, keep_dims=True)r  )r>   r  )r   Zpartial_scan_varcse_computerv   rw   r   2  s    
z%TritonKernel.scan.<locals>.<listcomp>c                   s,   g | ]\}} d | d| d|j dqS )ztl.where(roffset > 0, r   r   r  r  )r   Z	full_scanZpartial_scanr  rv   rw   r   ;  s    z = tl.where(roffset > 0, ) r  r  r   r   r  r  r  r  r  r  r  r  r  r  r   r  r  r   r  r%  rQ  r  r  r  r[  r   r  r,  r   rP  r  r   )r   r  r  r  r  broadcasted_valuesZaccumulatorsZcombine_helper_fnr   r   r  Zvalue_dtyper  r  Zreduced_sizert  r  Zpartial_scan_varsZpartial_reduce_varsZ	accs_nextZfull_scan_varsr  Zacc_nextZpartial_reducerC  rv   )r  r  r   rw   scan  s   





zTritonKernel.scanstable
descendingc                   s|  j sJ jrJ dtdd jD }| t|}jr&J djs-J dt	j
jj  j }tdd D tt|ksPJ  fddt|D }d	d
 fdd}jd jspJ jd rzdnd}	t|dkrd|d  d|d  d|	 d| d| d| d}
||
t||}ntdt||D ]\}}||_|j|_qt|S )Nr  c                 s  ra  rb  rc  r  rv   rv   rw   r   Z  r  z$TritonKernel.sort.<locals>.<genexpr>z(ops.sort not supported inside ops.maskedz3ops.sort is only supported in persistent reductionsc                 s  r  rq   r  r  rv   rv   rw   r   e  r  c                   s2   g | ]\}} d | d   d| dqS )r   r   r   r  )r%  )r   r  r   )r  r  r   rv   rw   r   g  s    z%TritonKernel.sort.<locals>.<listcomp>c                 S  r  )Nr  c                 s  r  r  rv   rz  rv   rv   rw   r   o  r  z1TritonKernel.sort.<locals>.csv.<locals>.<genexpr>r  r  rv   rv   rw   r  n  r   zTritonKernel.sort.<locals>.csvc                   s   fddt |D }tfdd|D r fdd|D S  fddt |D }j| d  t||D ]\}}rG|_j|| q>t|S )Nc                   r  r  rv   r  r  rv   rw   r   r  r  z;TritonKernel.sort.<locals>.cse_multiple.<locals>.<listcomp>c                 3  r  rq   r  r  r   rv   rw   r   s  r  z:TritonKernel.sort.<locals>.cse_multiple.<locals>.<genexpr>c                   r  rv   r  r  r   rv   rw   r   t  rJ  c                   s   g | ]}j j | d qS r  r  r  )r  r   rv   rw   r   u  r  r*  )	r   r  r  r   r   r   r  r  r  )r8  r  r  r  r  r  rC  r  r  )r  r8  r  rw   r  q  s   z'TritonKernel.sort.<locals>.cse_multipler  rp   rnumelr   ztriton_helpers.sort_with_index(r   r   rD   z	, stable=z, descending=r   zUnhandled sort)r  r  r   r   r  r  r  r  r  r  r  r  r  r  r  r  r   	enumerater  r  r  r   r   r  )r   r  r  r  r  r  r   r  r  r  r8  r  rC  Z	input_varrv   )r  r  r  r   rw   sortQ  sJ   


zTritonKernel.sortc                   s~  | j s| js| js| js| js| jsdS dd | jD }| jrRt|dkrRt	|D ]d\}}| j
j|d1 |j}| jr?dnd}| jrFdn| d	}| j
d
| d| d| d|  d	 W d   n1 sjw   Y  | j
j|d d | || j
 W d   n1 sw   Y  q+| j
jt|d* | | j
 | j
| j  | j
| j | j
| j | j
| j W d   n1 sw   Y  tg t	|D ]}\}}| j
j|d d\ | j|j  D ]K\}}|t|d k r||d  }	| j|	j | }
t|	}t|	j|  fddt||
D }| j
t| j| | d| dtj | d qW d   n	1 s@w   Y  | j!"| j# |$  qn| j
| j  | j
| j | j
| j | j
| j | j
| j | jr| js| jr| j% d}| j
jd| ddd | j&'  | j
| j | j (  | j(  | j(  | j(  | j(  | j(  dS )a  
        Concat output code from index_code, loads, compute, stores,
        suffix into self.body.

        For pointwise kernels, this is called just once at the end.

        For reduction kernels, this generates a loop over the reduction
        axis.
        Nc                 S     g | ]}|j r|qS rv   r  r  rv   rv   rw   r     r   z-TritonKernel.codegen_body.<locals>.<listcomp>r   )r   Zrsplit_startr3  Z
rsplit_endr  zfor zoffset in range(r   zBLOCK):rD   c                   s   g | ]
\}}||   qS rv   rv   )r   curprevZprev_num_iterrv   rw   r     s    
z = tl.advance(r   z + tl.program_id(1)zR
                if HAS_RSPLIT:
                    triton_helpers.x_grid_barrier(rn  Tr  ))indexing_coder;  rG  r  r  r  r   r  r   r  r,  rO  r  r  r   r   r  r  r   r  r  r   r  r   r   r   r  r   rK   r  rB   r   r   r  Z
invalidater  cache_clearr  r  r  clear)r   Z
loop_treeslevelr   r  Z
loop_startZloop_endr  ZadvancementZ	prev_treeZprev_advancementZ
prev_blockZsem_ptrrv   r  rw   codegen_body  s    	









zTritonKernel.codegen_bodyr  c                 C  s   g }|   rOg }| d|g  |D ]=}t|tr |t| qt|tr3|ttjj	
|j qt|tjrF|ttjj	
| qtdt| |S )Nr   z!Unsupported numel argument type: )r  add_numel_to_call_argsrP  r  rQ  r~   r`   rB   r   r   	size_hint
inner_exprr   r4  r  r  )r   r/  Z
numel_argsr  rv   rv   rw   kernel_benchmark_extra_args  s   

z(TritonKernel.kernel_benchmark_extra_argsc                 C  s  t  }| j \}}}}|g d |  t }g }t||D ]\}	}
dt| }t	j
|	}|r]|| dt	j
j|  dt	j
j|  d|  d|  d
 n||	t	j
jv rt	j
j|	 }|| dt	j
j|  dt	j
j|  d|j d|j d
 nKt|
trt	j
j|
j}d|
jv rd	}|| d
|  n,t|
trt	j
 }t	j
j|
j}|| d| d| d|
j d ntd|	 | | q#|!| "  |dd#| d W d    n1 sw   Y  |g d t	j
 }|j$}| S |dt	j
j%&| d | 0 |t	j
j%'| d| }|| d| d |t(t)j* d| d W d    n	1 s[w   Y  W d    n	1 skw   Y  |g d | A |dt	j
j%&| d |  |t	j
j%'| |dt(t)j* d W d    n	1 sw   Y  W d    n	1 sw   Y  |g d | / |d |d |d |d |d|  |d |d W d    |S 1 sw   Y  |S )N)r   r   zdef get_args():Zarg_z = rand_strided(r   z
, device='z	', dtype=r   r  r   r*  z = torch.zeros(z*Don't find the buffer or const tensor for r  r  )
r  zdef call(args):zwith rM  streamz = get_raw_stream(z.run(*args, stream=)r  r  z def benchmark_all_configs(args):z.benchmark_all_configs(*args))r  r  zif __name__ == '__main__':z<from torch._inductor.runtime.benchmarking import benchmarkerr   zargs = get_args()z:ms = benchmarker.benchmark_gpu(lambda: call(args), rep=40)z	num_gb = zgb_per_s = num_gb / (ms / 1e3)z<print(f"{ms:.3f}ms    {num_gb:.3f}GB    {gb_per_s:.2f}GB/s"))+rL   r/  python_argdefs
writelinesrO  r  r  r   r,  rB   r   Ztry_get_bufferr   r   
size_hintsget_sizeZ
get_stride
get_devicer0  	constantsrS  r   devicer  rP  rQ   r  r   r'  rS   get_current_device_or_throwKeyErrorrQ  extendr  r   r   
device_opsZdevice_guardZ
set_devicer~   r7   KERNEL_NAME)r   num_gbr"  Z_argdefs	call_argsr  r!  Zname_cntZ	var_namesarg_nameZarg_sigvar_namer  Zconst_tensorZsymval_hintr  r  current_devicer   stream_namerv   rv   rw   codegen_kernel_benchmark  s   
D@



'











z%TritonKernel.codegen_kernel_benchmarkc                 C  s   t dtjjdS )Nzl
            from torch._dynamo.testing import rand_strided
            {}
            import torch
        Zget_raw_stream)textwrapdedentr0  rB   r   r  Zimport_get_raw_stream_asr   rv   rv   rw   imports_for_benchmark_kernel_  s   z)TritonKernel.imports_for_benchmark_kernelc                 C  s6   | j rdS | jr
dS | jr| jsJ dS | jrdS dS )Nr  r  r  r  Z	pointwise)r  r  r  r  r   rv   rv   rw   _get_heuristich  s   
zTritonKernel._get_heuristicc                  C  s   t jj t  tjtjtjj	tj
tjtjtjtjtjjtjjtjjd} t jjd ur/d| d< t r7d| d< tjrNtj| d< tj| d< tj| d< tj| d< tjr`tj| d	< tj| d
< tj| d< | S )N)Zbackend_hash$are_deterministic_algorithms_enabledassert_indirect_indexingautotune_local_cacheautotune_pointwiseautotune_remote_cacheforce_disable_cachesdynamic_scale_rblockmax_autotunemax_autotune_pointwisemin_split_scan_rblockspill_thresholdstore_cubinTZis_hipr^  profile_bandwidthprofile_bandwidth_regexprofile_bandwidth_output/profile_bandwidth_with_do_bench_using_profilingcoordinate_descent_tuning coordinate_descent_search_radius'coordinate_descent_check_all_directions)rs   utilsZ_tritonZtriton_hash_with_backendr  r   r  r  r   r  r  r  r  r  r  r  r  r  r_  r`  r^  r  r  r  r  r  r  r  )inductor_metarv   rv   rw   inductor_meta_commont  s@   



z!TritonKernel.inductor_meta_commonc                    s8  t  }i }| j D ](\}}t|r| jsq
tjj|}t	|t
tjfs(d}ntt
|}|||< q
|d u r\|t  tj j}|dkrM|d n|d tjr\||   | j \ }	}	tD ]$\}
}t	|trttj|j}|tjjjv rt|jtjjj| |
< qitt  }| j D ]G}|| jj!v r|"| jj!|  || jj#v r|tjj$vr|| j$vr|"tt%| jj#| j& || jj'v r| jj'| }t	|t(rJ |"| qt) D ]\}}t	|t*r|j+t,j-kr|"|j qt.|}| / D ]}t|j0 d|j1}2|  2t3|j q fdd}| j4D ]}|j5r2| j6r2q&|j7d u r:q&||j08  d q&| j9rN|d	 t:| j; d
}|t<=tj i d}tjj>pjtjj?}| @ jAtB| jCttDjE||| jF| jG| jHd| I }| j9r| j6|d< d }tjstjJr| K d }||d< tLg|d< tMD ]}d|d | j< q|| _N| O  | jPD ]}|Qd || q| jRrd| S  d| jRjd|d|d	}nM| jr	| jTU }d| S  d|d| d|d|d}n/d}tV|dkr tVtWdkrd}nd}d| S  d|d| d |d|d!| jX d}|| |Qd"|pGttDjY d#dZd$d%  D  d& |[ ( | \| | j] D ]\}}|Q| d'|  qf|| j^ W d    n	1 sw   Y  tjr|| _| |` S )(Ni    cpuz"triton_helpers.set_driver_to_cpu()z"triton_helpers.set_driver_to_gpu()r  c                   s*   t  r
t|   t| dd d S )NT)Zis_constexpr)r=   rQ  rH   rF   )r  argdefsr  rv   rw   add_constexpr_arg  s   z6TritonKernel.codegen_kernel.<locals>.add_constexpr_argr   r  )Z
size_dtyper#  )r  r  r  )Z	grid_typer  kernel_namemutated_arg_namesoptimize_memr  num_loadnum_reductionr  g    eAZkernel_num_gbZconfigsrD   r  r   z$
                @triton_heuristics.z(
                    config=zI,
                    filename=__file__,
                    triton_meta=z$,
                    inductor_meta=z;
                )
                @triton.jit
            z!(
                    size_hints=z%,
                    reduction_hint=r   r  ztile_hint=TileHint.SQUARE,ztile_hint=TileHint.DEFAULT,r   zH
                    filename=__file__,
                    triton_meta=z*,
                    min_elem_per_thread=zdef r  c                 s  r  rq   )Z	full_namer  rv   rv   rw   r   n  r  z.TritonKernel.codegen_kernel.<locals>.<genexpr>r  r*  )arL   r  r  r8   r  rB   r   r   Zsymbolic_hintrP  r  r   r)  r,   r   r   r  r  r   benchmark_kernelr	  r/  r  r  rQ   r   r   r   Zinv_precomputed_replacementsr'  r   r~   Z	mutationsZinput_buffersr  rF  removed_buffersrM   Z
inner_nameZoutput_buffersrP   r   rS   Z	zero_moderT   ZZERO_ON_CALLr  r  r  r  rQ  rF   r   r  r  
tensor_dimr   r  r_   rZ  r(   r#  Zis_inferenceZis_backward_get_grid_typerr   setr  r7   DESCRIPTIVE_NAMEr  r(  r)  r   r  Zestimate_kernel_num_bytesr[   r\   r  r  r  r   r  r
  r  Zget_reduction_hintr   r]   r  r  r   rO  codegen_static_numelsaliasesr,  r  r   ) r   r'  coder  r  r  Z
numel_hintr  device_typer!  r  r  r  Zmutated_argsZmutationZmutation_argargnamer   Zsizeargr$  Ztriton_meta_signaturer  r'  r  r   Zarg_numr  Zheuristics_lineZreduction_hintZ	tile_hintoldnewrv   r"  rw   codegen_kernel  s*  








	




	



,

zTritonKernel.codegen_kernelc                 C  sx   t jj| } t| tjtfrt| }t|}|S d}t jj	| |s:|dkr.t
d|  |d9 }t jj	| |r#|S )N   i @  z!Failed to find static RBLOCK for r   )rB   r   r   simplifyrP  r   r)  r  r,   statically_known_leqr  )r  r  rv   rv   rw   _get_persistent_RBLOCK{  s   z#TritonKernel._get_persistent_RBLOCKc                 C  s&   zt |  W dS  ty   Y dS w )NTF)r  r;  r  )r  rv   rv   rw   has_persistent_RBLOCK  s   
z"TritonKernel.has_persistent_RBLOCKc                 C  s   ddd}| j D ]Z}|jr| jr)tjj|j}||r)||j	 dt
|  |jrU| jrU| jrB| | |j}d| d	}n| |j}||j	  d
|  |j	dkrb| jrb|d qdS )a  
        We get a small speedup from hard coding numels if they are static.

        This code stomps on the passed-in values by writing an constant to the top of the kernel.

        In a kernel like:
        def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):

        We would add
        xnumel = 4096
        r0_numel = 768

        After the signature, before the kernel code, if we decided to make these static. As its hardcoded, it becomes
        a better signal to triton on how to unroll and do some static indexing. So, it's not so much that downstream
        knows that its a static numel, as that you just plop a constant into the kernel.
        r   r   ro   rn   c                 S  s   t | tjtfS rq   )rP  r   r)  r  r  rv   rv   rw   is_static_integer  r  z=TritonKernel.codegen_static_numels.<locals>.is_static_integerznumel = z*triton_helpers.constexpr_next_power_of_2((z + RSPLIT - 1) // RSPLIT)zBLOCK: tl.constexpr = r  zXBLOCK: tl.constexpr = 1N)r   r   ro   rn   )r   r  r  rB   r   r   r9  r  r   r  r  r  r  r  r6  r;  r   r  )r   r2  r=  r   Zsimplified_tree_numelr  r  rv   rv   rw   r0    s    


z"TritonKernel.codegen_static_numels type[triton_heuristics.GridExpr]c                 C  s|   t dd | jD }| jr|dksJ tjS |dkrtjS |dkr0tt| j| jr-tj	S tj
S |dkr7tjS td| )Nc                 S  s   g | ]}t |j qS rv   )r  r  r  rv   rv   rw   r     rJ  z/TritonKernel._get_grid_type.<locals>.<listcomp>rD   r   r   z"Unsupported number of dimensions: )r  r   r  r%   ZCooperativeReductionGridZGrid1Dr   r   needs_yz_grid_overflowZGrid2DWithYZOverflowZGrid2DZGrid3Dr  )r   r  rv   rv   rw   r-    s   zTritonKernel._get_grid_typec                 C  s`   | j D ]*}t|jtjtjfr|j}ntjj	||}|j
r!| jr-|| |t| qd S rq   )r   rP  r  r   r)  r   rB   r   wrapper_codeZgenerate_numel_exprr  r  rQ  r  )r   r'  r  	arg_typesr   r   rv   rv   rw   r    s   

z#TritonKernel.add_numel_to_call_argsr  Optional[IRNode]c                 C  s~   t jj}|  | j \}}}}| ||| | jjD ]}|| q|j	||d|| j
d t| jjD ]}|| q5d S )NT)r   rA  r  )rB   r   r@  Zwrite_triton_header_oncer/  r  r  Zworkspace_argsZgenerate_workspace_allocationZgenerate_kernel_callr  r  Zgenerate_workspace_deallocation)r   r'  r  wrapperr!  r  rA  wsrv   rv   rw   call_kernel  s    zTritonKernel.call_kernelc                 C  s   t jj}| j \}}}}t||D ]0\}}t|trBt jjr,|	d| d| d qd| d}|	| d| d}|	| qd S )Nz:AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_check_inf_and_nan("z", z));zassert not z.isnan().any().item()z.isinf().any().item())
rB   r   r@  r/  r  r   rP  rR   Zcpp_wrapperr   )r   rC  r!  r  Zarg_signaturesr  Zarg_signaturer8  rv   rv   rw   codegen_nan_check  s   


zTritonKernel.codegen_nan_checkr  c                 O  s   t |i |S rq   )r  )r   r/  r  rv   rv   rw   create_cse_var  r   zTritonKernel.create_cse_varentryrW   c                 C  sF   |j  d| | |j }|jjr| j| d S | j| d S r  )	r'  r  r6  r   rootr  r  r   r,  )r   rH  r8  rv   rv   rw   codegen_iteration_ranges_entry  s   z+TritonKernel.codegen_iteration_ranges_entryrX   c                 C  sn   |j d usJ | |j }| j}|dkrd| dnd}| jr*| jr*|jr*| d}d|j  d| | S )Nr  r  r   r   z + rsplit_startztl.arange(0, zBLOCK))r,  Zindexing_size_strrZ  r  r  r  r  r   )r   rH  rS  rZ  r  rv   rv   rw   r     s   
z)TritonKernel.iteration_ranges_ranges_coder   c                 C  s0   | j }|  }dg| }d| d| d| dS )NrD   r  r   r   )rZ  r  )r   rH  r   rZ  r  rS  rv   rv   rw   iteration_ranges_scalar_code  s   
z)TritonKernel.iteration_ranges_scalar_codec                 C  st   |j d usJ d|j  d}| |r#d| d|j d  d|j  d}|j||}| jdkr8| d	| j dS |S )
Nztl.program_id(r   r  z + tl.program_id(rD   z) * tl.num_programs(r  r  r  )r  r?  Z	pid_cacher  rZ  )r   rH  r  pidrv   rv   rw   iteration_ranges_get_pid  s   
 
z%TritonKernel.iteration_ranges_get_pidc                 C  s0   |j dko|j o| j otjj|jt  S r  )	r  has_zdimr  rB   r   r   r:  r  r+   )r   rH  rv   rv   rw   r?  $  s   
z#TritonKernel.needs_yz_grid_overflowr  r  c                 C  s&   | j r| j |  d S t|  S )Nr   )r  r   r)   )r   r  rv   rv   rw   r
  ,  s   zTritonKernel.max_blockr   c                 C  s   | j sdS | jr#|j  d| jv r#| j|j  d dkr"dS ntjj|jdr.dS |j	r;| j
r;| |j}n|jdkrF| jrFd}n| |j}|j	rX| jrX||   }tjj|j|rs|jdkpr|jprtjj|jt S dS )NFr   rD   Tr  )r  r  r  r   rB   r   r   r   r  r  r  r;  r  r
  r  r  r4  r  rN  r:  r+   )r   r   r
  rv   rv   rw   r  1  s,   
zTritonKernel._has_constant_maskc                 C  s"   | j d }|jdksJ | |S )Nr   r  )r   r  r  )r   Zxtreerv   rv   rw   r  [  s   

z TritonKernel._has_constant_xmaskr   r   c                 C  s6   | j D ]}| |r||j d q|d d S )Nr   rp   )r   r  r  r  )r   r   r   rv   rv   rw   r  `  s
   

zTritonKernel.filter_masksc                 C  s   dd t tjd | j D S )Nc                 S  s   g | ]}t | qS rv   r1  r   rv   rv   rw   r   j  s    z7TritonKernel.get_reduction_prefixes.<locals>.<listcomp>)r  r   r   r  r   rv   rv   rw   get_reduction_prefixesh  s   z#TritonKernel.get_reduction_prefixesr9  rL   c                 C  sp   dd | j D }dtdd |D }|d| |  dd | j D }t|}|d| |  d	S )
z^
        Generates code that flattens ND reduction numels, block sizes, etc. into 1D.
        c                 S  r  rv   )r  r  rv   rv   rw   r   t  r   z9TritonKernel.codegen_reduction_numels.<locals>.<listcomp>r  c                 s  ra  )r  Nrc  r  rv   rv   rw   r   u  r  z8TritonKernel.codegen_reduction_numels.<locals>.<genexpr>z	rnumel = c                 S  s   g | ]}|j rtj|j qS rv   )r  r   r   r   r  rv   rv   rw   r   y  s    
zRBLOCK: tl.constexpr = N)r   r   r  r   r  r:   )r   r9  Zreduction_treesr  Z	rn_blocksr:  rv   rv   rw   r  o  s   z%TritonKernel.codegen_reduction_numelsr  list[sympy.Symbol]c                   s   |   } fdd|D S )zK
        Helper to initialize symbols like rn_numel, rn_base, etc.
        c                   s&   g | ]}t j|  fi  qS rv   )r   r   )r   r  r  r  rv   rw   r     s   & z7TritonKernel._get_reduction_symbols.<locals>.<listcomp>)rO  )r   r  r  rn_prefixesrv   rQ  rw   r    s   z#TritonKernel._get_reduction_symbolsr   c                   sB   |   }| jdddd  fddtt|d D tdg S )z
        Compute coefficients to convert ND reduction indices to linear indices.
        For example:
          rindex = r0_index * r1_numel * ... * rn_numel + ... + rn_index.
        r  Tr   c                   s    g | ]}t  |d  d qS r   )r:   r5  Z	rn_numelsrv   rw   r     s    z<TritonKernel._get_reduction_index_coeffs.<locals>.<listcomp>rD   )rO  r  r   r   r   r)  )r   rR  rv   rS  rw   _get_reduction_index_coeffs  s   

z(TritonKernel._get_reduction_index_coeffs
multi_indsc                 C  s   |   }t||S )zK
        Compute linear reduction indices from N dimensional ones.
        )rT  r9   )r   rU  Zcoeffsrv   rv   rw   r    s   
z'TritonKernel._flatten_reduction_indicesc                 C  sd   | j dddd}| j dddd}| |}|d| |  | |}|d| |  dS )zX
        Generates code that converts ND reduction indices into linear indices.
        r   Tr   r   z
roffset = z	rindex = N)r  r  r   r   )r   r9  Z
rn_offsetsZrn_indsr%  rindexrv   rv   rw   r    s   

z&TritonKernel.codegen_reduction_indicesr2  c                 C  s  |j }|jr||j d| d| d nP|jd u r2||j d| |  || d n4|jd urB| d| | }n	| || d}|| d| 	| d|
  d|j d| g | |r||  }|| d	| d
 d S || d|j d| d d S )Nr*  z	offset + r  z
offset = 0r   z	offset = r  r   zmask = tl.full(z, True, tl.int1)zmask = z < r  )r  r  r   r'  r  r  r,  rK  r  rM  r   r  r%  )r   rH  r2  r  r8  rY  rv   rv   rw   r    s$    


"z,TritonKernel.iteration_ranges_codegen_header)r   TN)r  r  r  r  ro   rp   r  r  ro   r~   r   r   )r   r   )r   )r'  r~   r  r~   r  r   ro   r'  )r   r   rS  r   r2  rn   r   rn   )r'  r~   r   r   rq   )
r'  r~   r   r   r   rJ   rE  rA   ro   rp   NN)r  rJ   rP  rQ  rR  rJ   rS  r  rw  rn   rT  rU  rV  rW  ro   rJ   )r   r~   r  r  ro   r~   )
r  r  r  r  r]  r@   r   r^  ro   r^  )r  r  )r'  r~   r   r   r   r^  )r  r  ro   r~   )r  r  r  r  r  r  ro   r  )
r  r  r  r  r  rn   r  rn   ro   r  )ro   r  )ro   r>  )r'  r~   r  rB  r  )ro   r  )rH  rW   )rH  rX   ro   r~   )rH  rX   r   r   ro   r~   )rH  rX   ro   rn   )r  r~   ro   r  )r   rX   ro   rn   )r   r   ro   rp   )r9  rL   ro   rp   )r  r~   ro   rP  rB  )rU  r   ro   r   )rH  rX   r2  rL   ro   rp   )Mrr   ry   rz   r~  r  r|   r5  r  r#  r  r  r  r  r  r  r  r  r  r   r  r  r.  r1  r:  r7  r  rN  rK  rX  rZ  r\  r  r  r  rx  r  r  r  r  r  r  r  r  r  r  r  r	  r
  rD  r   r7  r;  r<  r0  r-  r  rE  rF  rG  rJ  r  rK  rM  r?  r
  r  r  r  r1   rO  r  r  rT  r  r  r  r  rv   rv   r  rw   r  1  s   
 
'
%


  +
/


 	*
3
	
   
	
;


*
3
o@
dZ	

' `


&









*




r  c                	      s   e Zd ZU eZded< eejej	ej
ejejejejejgZd) fddZed*ddZdd Zdd Zd+d,ddZ	d-d.ddZd/d!d"Zd0d%d&Zd'd( Z  ZS )1TritonSchedulingz	type[Any]kernel_type	schedulerOptional[Scheduler]ro   rp   c                   sF   t  | |d u st|dsd S |jD ]}t|ttfr t|_qd S )Nr  )	r  r  r   r  rP  r0   r.   debug_triton_codeZdebug_device_str)r   r[  r  r  rv   rw   r    s   
zTritonScheduling.__init__r  torch.devicec                 C  s*   t jjst jjrtg | jtjS | jS rq   )r   r   Zcooperative_reductionsZforce_cooperative_reductionsr   backend_featuresrG   ZREDUCE_TO_SINGLE_ELEMENT)ru   r  rv   rv   rw   get_backend_features  s   z%TritonScheduling.get_backend_featuresc                   s   t jj}t||\}}|r|| tjrAddlm m	 t
fdd|D sC fdd|D }||j dd|  d S d S d S )	Nr   r-   ForeachKernelSchedulerNodec                 3  s    | ]}t | V  qd S rq   )rP  r  )rb  rv   rw   r     s    

z3TritonScheduling.codegen_comment.<locals>.<genexpr>c                   s   g | ]}t | r| qS rv   )rP  get_namer  )r-   rv   rw   r     r  z4TritonScheduling.codegen_comment.<locals>.<listcomp>z Fused node name list: r   )rB   r   r@  r5   r   r   Zdebug_fusionZtorch._inductor.schedulerr-   rb  r   commentr   )r   node_schedulerC  originsZ_detailed_origins
node_namesrv   ra  rw   codegen_comment  s"   

z TritonScheduling.codegen_commentc                 C  st  t jj}||jv r|j| }|S tjjrt|tjjnd}t|d d }d	d|||
 g}||j|< tjjr;|nd}|ttj|}|ttj|}|dd}tt| d\}	}
}t }t rnt|| |d	|d
 |j|dd t j }|d|j d d| }t||\}}|d| d | 7 }||| | tdrt ||| |S )Nr   r   r!  r   triton_z#pragma CMT#pyzasync_compile.triton(z, '''Tr  z''', device_str='z')z# kernel path: r  Zkernel_metadata)!rB   r   r@  Zsrc_to_kernelr   r   Zdescriptive_namesr4   rC   r   Znext_kernel_suffixZunique_kernel_namesreplacer~   r7   r/  r  r"   r!   r  rL   async_compileZuse_process_poolr   r   r  r  r5   define_kernelr   r   Zis_metric_table_enabledZlog_kernel_metadata)r   src_codere  r   rC  r%  Z
fused_nameZkernel_categoryZ	subs_name	_basenamer!  Zkernel_pathZcompile_wrapperr  Zmetadata_commentrf  Zdetailed_originsrv   rv   rw   rn    sD   

5




zTritonScheduling.define_kernelr  tuple[float, str]c                 C  s6   | j |dd}t|}| j||tdd |D dS )z
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        T)r*  c                 s  r  rq   rc  r  rv   rv   rw   r   G  r  z9TritonScheduling.benchmark_fused_nodes.<locals>.<genexpr>)rg  )generate_kernel_code_from_nodesr#   r  benchmark_codegened_moduler   )r   r  n_spills_thresholdro  r  rv   rv   rw   benchmark_fused_nodes?  s
   
z&TritonScheduling.benchmark_fused_nodesNrg  Optional[OrderedSet[str]]c           	        s   t tjj}t  |tj  dfddfdd}fdd}|dur.|ntdg}t	d	|j
 | durUj
fW  d   W  d   S   jjzj  d
  W n8 ty } z,tjjrw t	d|| td|  j
fW  Y d}~W  d   W  d   S d}~ww j}t|dksJ |d
 j|krtdnt fddtjd
kr؈t fdd t	d| |  j
fW  d   W  d   S 1 sw   Y  W d   dS 1 s	w   Y  dS )z$Benchmark an already compiled moduleNc                     $    j d usJ tj j d d S Nr   z.kernel_perf__file__ospathsplitextrv   r  rv   rw   cache_file_pathU     zDTritonScheduling.benchmark_codegened_module.<locals>.cache_file_pathc                    sD     } t | d}|t W d    d S 1 sw   Y  d S )Nwopenwriter~   r}  fd)r  msrv   rw   store_cacheY  s   "z@TritonScheduling.benchmark_codegened_module.<locals>.store_cachec                    sJ     } t j| r#t| }t| W  d    S 1 sw   Y  d S rq   )r|  r}  existsr  floatreadr  r  rv   rw   
load_cache^  s   

 z?TritonScheduling.benchmark_codegened_module.<locals>.load_cacheunknown%kernel src code for %s written to: %sr   z*Exception (%s) in compiling fused nodes %sinfrD   c                        j   d S r(  
clone_argsrv   r/  callwrapped_jit_functionrv   rw   re    r`  z=TritonScheduling.benchmark_codegened_module.<locals>.<lambda>c                     s
   j   S rq   r  rv   r/  r  rv   rw   re    s   
 z+The fused kernel for %s took %.3f ms to run)r   rB   r   r3  r   r  r  r   rh  debugr{  get_argsr  ri  r  	Exceptionr   r   Z.disallow_failing_autotune_kernels_TESTING_ONLYr  	launchersr   n_spillsr&   benchmark_gpur&  )	r   r  ru  rg  Zdevice_interfacer  r  r}  r  rv   )r/  r  r  r  r  r  rw   rt  J  st   "(
Tz+TritonScheduling.benchmark_codegened_modulekernel_featuresrf   kernel_args	list[Any]kernel_kwargsr  list[TritonKernel]c           	      C  s   | d}|otdd | D }| j}|rddlm} |}|r%d|d< | dr2d	|d
< d|d< t|jsC|	d
r?J d|d
< t
j||||}||i |}| |||S )Nr  c                 s  r  rq   )is_split_scanr   r  rv   rv   rw   r     r  z9TritonScheduling.create_kernel_choices.<locals>.<genexpr>rD   )TritonSplitScanKernelFoverride_cooperative_reductionr  Toverride_persistent_reduction)Zcontains_opr   Zscheduler_nodesrZ  Ztriton_split_scanr  r  r<  reduction_numelr  rB   r  Ztriton_kernel_kwargsadd_multi_kernel_choices)	r   r  r  r  Zis_scanr  rZ  r  r   rv   rv   rw   create_kernel_choices  s*   

z&TritonScheduling.create_kernel_choicesr   r  c           
      C  s   |g}t jjs	|S |jo|d }|jo|d }|r,|| j|i |ddi |r`|jj	}t
jj|dr`|| j|i |ddi } |r`|jr`|| j|i |ddd t|dkr{|dd  D ]}	|j|	_ql|jdd d	 |S )
Nr  r  Fi   )r  r  rD   c                 S  r   rq   )r  )krv   rv   rw   re    s    z;TritonScheduling.add_multi_kernel_choices.<locals>.<lambda>)r  )r   r   Zmulti_kernelr  r  r  rQ  rZ  r  r  rB   r   r   r:  r   Zmust_keep_buffersr  )
r   r   r  r  ZkernelsZoptional_persistentZoptional_cooperativer  r-  Zkernel2rv   rv   rw   r    s^   



	
z)TritonScheduling.add_multi_kernel_choicesc                   s  fddfdd}fdd}dg }}d}t jj}t|t j_t jj}t|t j_tjdk}	tjdk}
| j|d	|	|
d	d
}|D ]\}}}dd |D }dd |D }|	t
tjd}t|td|j | \d ur|7 }|7 }|j qE  jjj  d  j}t|dksJ |d jdkrtd nt fddt fddtdtdd |D  |  |7 }|7 }|j qE|t j_|t j_|||fS )Nc                     rx  ry  rz  rv   r  rv   rw   r     r  z@TritonScheduling.benchmark_combo_kernel.<locals>.cache_file_pathc                    sX     } t j| r*t| }tdd |  D W  d    S 1 s%w   Y  dS )Nc                 s  r  rq   )r  )r   r}  rv   rv   rw   r     r  zNTritonScheduling.benchmark_combo_kernel.<locals>.load_cache.<locals>.<genexpr>rX  )r|  r}  r  r  r  r  r  r  r  rv   rw   r    s   
 z;TritonScheduling.benchmark_combo_kernel.<locals>.load_cachec                    sP     } t | d}|td t  W d    d S 1 s!w   Y  d S )Nr  r  r  r  )r  r  ms_clonerv   rw   r    s   "z<TritonScheduling.benchmark_combo_kernel.<locals>.store_cacher   g        T)Zsubkernel_nodesZcustom_part_algorithmenable_autotunemixed_sizesZonly_gen_src_codec                 S  s   g | ]}|  qS rv   )	get_nodesr  rv   rv   rw   r   !  r  z;TritonScheduling.benchmark_combo_kernel.<locals>.<listcomp>c                 S  s   g | ]}|D ]}|  qqS rv   rr  )r   r  r  rv   rv   rw   r   "  r  ri  r  rD   r  c                     r  r(  r  rv   r  rv   rw   re  C  r`  z9TritonScheduling.benchmark_combo_kernel.<locals>.<lambda>c                     s   j   d S r(  r  rv   r  rv   rw   re  F  s    zDThe fused kernel for %s took %.3f ms to run, %.3f ms to clone inputsc                 s  r  rq   rr  r  rv   rv   rw   r   K  r  z:TritonScheduling.benchmark_combo_kernel.<locals>.<genexpr>)rB   r   r+  r   Zinplaced_to_remover   Zcombo_kernels_autotuneZcombo_kernel_allow_mixed_sizesZgenerate_combo_kernel_coderl  r~   r7   r  r#   r  rh  r  r{  rQ  r  r  ri  r  r  r   r  r  r&   r  )r   Z	node_listr  r  Ztotal_ms	file_listZtotal_clone_msZremoved_buffers_origZinplaced_to_remove_origr  r  Zkernel_code_listro  r!  Z
node_groupZfused_node_listsnamesr  rv   )r/  r  r  r  r  r  r  rw   benchmark_combo_kernel  sz   





z'TritonScheduling.benchmark_combo_kernel)r[  r\  ro   rp   )r  r^  )r  )ro   rq  )r  N)rg  rw  ro   rq  )r  rf   r  r  r  r  ro   r  )r   r  r  r  r  r  ro   r  )rr   ry   rz   r  rZ  r|   r   rG   ZFOREACHZ	BUCKETIZEZINPLACE_BUFFERSZMASKED_SCATTER_WITH_INDEXZSCANZSORTZTRITON_TEMPLATESZTUPLE_REDUCTIONr_  r  r}   r`  rh  rn  rv  rt  r  r  r  r  rv   rv   r  rw   rY    s0   
 
:
W
%5rY  r  r-   r  c                 C  s
  g }|   }|d u st|tjsJ |r%|jd u r%||   d |S ddlm} | 	 }|d us5J | j
|}t|t|fsKJ dt| tj| tj}||   }|t_W d    n1 skw   Y  ||   d |t|d |S )Nz" Unfinalized multi template bufferr   )CUDACombinedSchedulingz]Scheduling backend should be SIMD or CUDACombined when generating debug Triton strings, got: z Triton code:z    )Zget_template_noderP  r   ZMultiTemplateBufferZmake_kernel_renderrQ  rc  Z0torch._inductor.codegen.cuda_combined_schedulingr  r  r[  get_backendrZ   r  rB   r   Zset_current_devicer   Zgenerated_kernel_countrs  r  r  r  rO  )r  linesZmulti_templater  r  backendZold_generated_kernel_countZtriton_coderv   rv   rw   r]  X  s2   
r]  r   )r   r~   rE  r   rF  r   ro   r~   rW  )r  r  ro   r  )r  r  ro   r  )r  r  ro   rn   )r  r  ro   rn   )ro   rd   rC  )rm   rn   ro   r  )r  r-   ro   r  )
__future__r   r  rH  r  r  r  loggingr|  r|  r  collections.abcr   r   r   typingr   r   r   r	   r
   r   r   Zsympy.printing.precedencer   rs   Ztorch._loggingZtorch.utils._pytreer  Z_pytreer  Ztorch._dynamo.device_interfacer   Ztorch._dynamo.utilsr   r   Ztorch._prims_commonr   Ztorch.utils._ordered_setr   Ztorch.utils._sympy.functionsr   r   r   Ztorch.utils._tritonr   Zutils._sympy.symbolr   r   r   r   Zutils._sympy.value_rangesr   r   r   r   r   rm  r    Z	codecacher!   r"   r#   Zops_handlerr$   Zruntimer%   Zruntime.benchmarkingr&   Zruntime.hintsr'   r(   r)   r*   Zruntime.runtime_utilsr+   r,   r[  r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   Zvirtualizedr?   r  r@   rA   rB   Zwrapper_benchmarkrC   Zblock_analysisrE   commonrF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   ZsimdrU   rV   rW   rX   rY   rZ   Ztriton_utilsr[   r\   r]   r^   r_   rC  r`   typesra   rb   r  rd   re   Zsimd_kernel_featuresrf   rg   	getLoggerrr   rh  Z_loggingZgetArtifactLoggerZperf_hint_logZschedule_logZ
fusion_logrh   r   r   r   	dataclassr   r   r   rT  r  r5  r  r  r  r  r  r  r  r  r  r  r  Z_initialize_pointwise_overridesr~  r  r   r  r  r~   r  r  r  rY  r]  rv   rv   rv   rw   <module>   s    @D 
  
 %








1   
y '$(
                   +   