o
    Zhh                    @  s
  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZmZ d dlmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlm Z! d d	l"m#Z# d d
l$m%Z% d dl&m'Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ ddl0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZC ddlDmEZEmFZFmGZGmHZHmIZImJZJ erd dlKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZY ddlZm[Z[ edZ\eeeX geWf Z]e^e[ Z_ee`ejaf Zbe`ZcejdeefdZgehefZid&d d!ZjG d"d# d#ejZke=d$d%G d&d' d'ZlejmG d(d) d)ZnejmG d*d+ d+ZoejmG d,d- d-ZpejmG d.d/ d/ZqejmG d0d1 d1Zreeleneoeqepf Zsi Ztd2eud3< G d4d5 d5Zvi Zwd6eud7< 	d'd(d?d@ZxG dAdB dBeZyd)dEdFZzd*dIdJZ{d+dLdMZ|	Nd,d-dPdQZ}e~dd.dRdSZd/dYdZZd0d\d]Zd1d^d_Zejejejejid`da ejejejejejejejejejejejfD Zdbeudc< d2didjZG dkdl dlZG dmdn dne(Z'G dodp dpZe
jdqe
jdrZd3dtduZG dvdw dwe6eeFe ZejmG dxdy dyZed4i dzeejd{d| d}d~deejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd| dd| dddeejdd| dd| dddeejdd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd| dddeejdd| dd| dddeejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~deejdd| dd~Zd eud< d5ddZG dd de:ZG dd de<ZG d	d
 d
eZejmG dd dZG dd dZe ZG dd dZG dd dZede`dZedeedZereejeHeeeedf f f ZG dd deeef ZG dd dZG dd deee ZejmG dd dZe~dd6d d!ZG d"d# d#ZG d$d% d%e7ZdS (7      )annotationsN)autoEnum)chain)	AnyCallablecastClassVarGeneric
NamedTupleOptionalTYPE_CHECKINGUnion)TypeVar)ELEMENTWISE_TYPE_PROMOTION_KIND)_pytree)
OrderedSet)int_oo)PythonPrinter)free_symbol_is_typesymbol_is_typeSymT)bound_sympyValueRanges   )configmetrics)DtypePropagationOpsHandler)BasicMathOpsMixinDefaultHandler)boolean_opsDeferredLineBasegenerate_assertIndentedBufferir_dataclass
ScopedDict	sympy_dotsympy_index_symbol
sympy_substriton_typeunique)ops
OpsHandlerOpsValueReductionType	StoreModeV)IteratorMutableMappingSequence)BufferChoiceCallerFixedLayoutIRNodeLoopBody)BaseScheduling	SchedulerSchedulerNode   PythonWrapperCodegen_TZschedulemsgstrreturnNonec                 C  s    t tjrt d|  d S d S )NzData type propagation: %s)schedule_logisEnabledForloggingDEBUGdebug)rA    rJ   M/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/common.pydata_type_loggerP   s   rL   c                   @  s4   e Zd ZdZdZdZedddZedddZdS )WorkspaceZeroModer   r=   r   abrC   c                 C  s:   | |ks	|t jkr| S | t jkr|S td| d|d)NzWorkspaceZeroMode.combine(, ))rM   UNINITIALIZEDNotImplementedErrorrN   rO   rJ   rJ   rK   combineZ   s
   
zWorkspaceZeroMode.combine	zero_fillboolc                 C  s   | rt jS t jS N)rM   ZERO_ON_CALLrR   )rV   rJ   rJ   rK   	from_boolb   s   zWorkspaceZeroMode.from_boolN)rN   rM   rO   rM   rC   rM   )rV   rW   rC   rM   )	__name__
__module____qualname__rR   rY   ZERO_PER_GRAPHstaticmethodrU   rZ   rJ   rJ   rJ   rK   rM   U   s    rM   T)frozenc                   @  s   e Zd ZU dZded< ded< ded< ded	< d
Zded< ejZded< e	d0d1ddZ
e	d2ddZe	d3ddZe	d3ddZd4ddZeZd5ddZd6d!d"Zed6d#d$ZeZeZeZd7d&d'Zd7d(d)Zd8d*d+Zd9d-d.Zd/S ):WorkspaceArga2  A temporary buffer used for a single kernel, then discarded.

    Not registered as a traditional buffer since there are no users,
    so it would be dead code eliminated.

    Args:
        nbytes: The size of the buffer in bytes.
        zero_fill: Whether the buffer should be initialized to zero.

    
sympy.ExprcountrM   	zero_modetorch.devicedevicerB   
outer_nameZws_ptr
inner_nametorch.dtypedtype
workspace_prefixrC   c                 C  s   |  t tjj S rX   )nextr0   graphZworkspace_id)rl   rJ   rJ   rK   unique_name}   s   zWorkspaceArg.unique_namerN   rO   rW   c                 C  s$   | j |j ko| j|jko| j|jkS rX   )rh   rj   rf   rT   rJ   rJ   rK   can_join   s   "zWorkspaceArg.can_joinc                 C  s0   t | j|j t| j|j| j| j| j| jdS N)rc   rd   rj   rf   rh   rg   )	ra   rc   rM   rU   rd   rj   rf   rh   rg   rT   rJ   rJ   rK   join   s   
zWorkspaceArg.joinc                 C  s\   | j |j kr| j|jkr| j|jksJ tt| j|jt| j	|j	| j | j| j| j
dS rq   )rj   rf   rh   ra   sympyZMaxrc   rM   rU   rd   rg   rT   rJ   rJ   rK   maximum   s   (zWorkspaceArg.maximumc                 C     | j S rX   rf   selfrJ   rJ   rK   
get_device      zWorkspaceArg.get_devicec                 C  ru   rX   )rj   rw   rJ   rJ   rK   	get_dtype   rz   zWorkspaceArg.get_dtyper6   c                 C  s&   ddl m} || j| j| jgdgdS )Nr   )r6   r=   )rf   rj   sizeZstride)irr6   rf   rj   rc   )rx   r6   rJ   rJ   rK   
get_layout   s   zWorkspaceArg.get_layoutc                 C  s   |   S rX   )r~   rw   rJ   rJ   rK   layout      zWorkspaceArg.layoutlist[sympy.Expr]c                 C  s   | j gS rX   )rc   rw   rJ   rJ   rK   get_size      zWorkspaceArg.get_sizec                 C  s
   t jjgS rX   )rs   SZOnerw   rJ   rJ   rK   
get_stride      
zWorkspaceArg.get_stridec                 C  ru   rX   )rg   rw   rJ   rJ   rK   get_name   rz   zWorkspaceArg.get_name	list[str]c                 C  s   g S rX   rJ   rw   rJ   rJ   rK   get_inputs_that_alias_output      z)WorkspaceArg.get_inputs_that_alias_outputN)rk   )rl   rB   rC   rB   )rN   ra   rO   ra   rC   rW   )rN   ra   rO   ra   rC   ra   )rC   re   )rC   ri   )rC   r6   )rC   r   rC   rB   )rC   r   )r[   r\   r]   __doc____annotations__rh   torchuint8rj   r_   ro   rp   rr   rt   ry   Zget_device_or_errorr{   r~   propertyr   Zget_output_specZmaybe_get_output_specZmaybe_get_layoutr   r   r   r   rJ   rJ   rJ   rK   ra   i   s:   
 







ra   c                   @  sB   e Zd ZU ded< ded< ded< ejjZded< dZd	ed
< dS )	TensorArgrB   namebufferri   rj   rb   offsetNOptional[str]alias_of)	r[   r\   r]   r   rs   r   ZZeror   r   rJ   rJ   rJ   rK   r      s   
 r   c                   @  s,   e Zd ZU ded< ded< ed
ddZd	S )SizeArgrB   r   rb   exprrC   r   c                 C     d S rX   rJ   rw   rJ   rJ   rK   r         zSizeArg.alias_ofNrC   r   )r[   r\   r]   r   r   r   rJ   rJ   rJ   rK   r      s
   
 r   c                   @     e Zd ZU ded< dS )ConstexprArgrB   r   Nr[   r\   r]   r   rJ   rJ   rJ   rK   r         
 r   c                   @  r   )TMADescriptorArgrB   r   Nr   rJ   rJ   rJ   rK   r      r   r   c                   @  s*   e Zd ZU ded< ded< dZded< dS )DeviceCodegenSchedulingConstructor
schedulingWrapperConstructorwrapper_codegenNOptional[WrapperConstructor]cpp_wrapper_codegen)r[   r\   r]   r   r   rJ   rJ   rJ   rK   r      s   
 r   zdict[str, DeviceCodegen]device_codegensc                   @  s   e Zd Zd+ddZd,dd	Zd-d
dZd,ddZd-ddZd-ddZd-ddZ	d-ddZ
d-ddZd-ddZd-ddZd-ddZd-ddZd-d d!Zd-d"d#Zd-d$d%Zd.d(d)Zd*S )/DeviceOpOverridesr   rB   rC   c                 C     t rX   rS   rx   r   rJ   rJ   rK   import_get_raw_stream_as   r   z*DeviceOpOverrides.import_get_raw_stream_as
device_idxintc                 C  r   rX   r   rx   r   rJ   rJ   rK   
set_device   r   zDeviceOpOverrides.set_devicec                 C  r   rX   r   rw   rJ   rJ   rK   synchronize   r   zDeviceOpOverrides.synchronizec                 C  r   rX   r   r   rJ   rJ   rK   device_guard   r   zDeviceOpOverrides.device_guardc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_device_guard   r   z"DeviceOpOverrides.cpp_device_guardc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_aoti_device_guard  r   z'DeviceOpOverrides.cpp_aoti_device_guardc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_stream_guard  r   z"DeviceOpOverrides.cpp_stream_guardc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_aoti_stream_guard  r   z'DeviceOpOverrides.cpp_aoti_stream_guardc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_getStreamFromExternal
  r   z+DeviceOpOverrides.cpp_getStreamFromExternalc                 C  r   rX   r   rw   rJ   rJ   rK   kernel_header  r   zDeviceOpOverrides.kernel_headerc                 C  r   rX   r   rw   rJ   rJ   rK   kernel_driver  r   zDeviceOpOverrides.kernel_driverc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_stream_type  r   z!DeviceOpOverrides.cpp_stream_typec                 C  r   rX   r   rw   rJ   rJ   rK   aoti_get_stream  r   z!DeviceOpOverrides.aoti_get_streamc                 C  r   rX   r   rw   rJ   rJ   rK   cpp_kernel_type  r   z!DeviceOpOverrides.cpp_kernel_typec                 C  r   rX   r   rw   rJ   rJ   rK   cpp_device_ptr  r   z DeviceOpOverrides.cpp_device_ptrc                 C  r   rX   r   rw   rJ   rJ   rK   tma_descriptor_helpers  r   z(DeviceOpOverrides.tma_descriptor_helpersidxOptional[tuple[str, str]]c                 C  r   rX   r   )rx   r   rJ   rJ   rK   cpp_global_scratch"  r   z$DeviceOpOverrides.cpp_global_scratchNr   rB   rC   rB   )r   r   rC   rB   r   )r   r   rC   r   )r[   r\   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rJ   rJ   rJ   rK   r      s$    















r   zdict[str, DeviceOpOverrides]device_op_overrides_dictrf   device_schedulingr   device_wrapper_codegenr   device_cpp_wrapper_codegenr   c                 C  s   t |||t| < d S rX   )r   r   )rf   r   r   r   rJ   rJ   rK   register_backend_for_device?  s   r   c                   @  sH   e Zd Ze Ze Ze Ze Ze Ze Z	e Z
e Ze Ze ZdS )BackendFeatureN)r[   r\   r]   r   ZFOREACHZ	BUCKETIZEZINPLACE_BUFFERSZMASKED_SCATTER_WITH_INDEXZSCANZSORTZTUPLE_REDUCTIONZPREFER_STORE_LOOP_ORDERZTRITON_TEMPLATESZREDUCE_TO_SINGLE_ELEMENTrJ   rJ   rJ   rK   r   J  s    
r   Union[torch.device, str, None]OrderedSet[BackendFeature]c                 C  sf   | d u rt  S t  t| tjr| j}nt| tsJ | }t|} t|}|s*J |d }|| S rX   )	r   init_backend_registration
isinstancer   rf   typerB   get_scheduling_for_deviceget_backend_features)rf   Zdevice_typeZscheduling_ctorr   rJ   rJ   rK   r   W  s   

r   featurerW   c                 C  s   t |tsJ |t| v S )zSee also V.graph.has_feature)r   r   r   )rf   r   rJ   rJ   rK   has_backend_featurei  s   r   Optional[SchedulingConstructor]c                 C  s   | t v r	t |  jS d S rX   )r   r   rv   rJ   rJ   rK   r   q     r   Fcpp_wrapperc                 C  s$   | t v rt |  }|r|jS |jS d S rX   )r   r   r   )rf   r   Zwrapper_codegen_objrJ   rJ   rK   get_wrapper_codegen_for_deviceu  s   r   c                    s  ddl m}  ddlm} ddlm} ddlm} ddlm	} ddl
m} ddlm} dd	lm} dd
lm} tdd u rS| ||d td fdd|tjjrP|n| tdd u ri||dtdfdd|| tdd u rvtd||| tdd u rtd||| tj }	|	dkrt|	d u rddlm}
 z%|
d}|
d}|
d}|r|r|rt|	||| W d S W d S W d S W d S  ty   Y d S w d S d S )Nr=   )CppScheduling)CppWrapperCpu)CppWrapperCpuArrayRef)CppWrapperGpu)CUDACombinedScheduling)HalideScheduling)MetalScheduling)TritonSchedulingr>   cpu)cpphalidetritonc                       t j | S rX   )r   cpu_backendr   )cpu_backendsrJ   rK   <lambda>      z+init_backend_registration.<locals>.<lambda>cuda)r   r   c                   r   rX   )r   cuda_backendr   )cuda_backendsrJ   rK   r     r   xpumpsZprivateuseoner   )_get_custom_mod_funcZ
Schedulingr?   ZCppWrapperCodegen)r   r   Zcpp_wrapper_cpur   Zcpp_wrapper_cpu_array_refr   Zcpp_wrapper_gpur   Zcuda_combined_schedulingr   r   r   r   r   r   r   wrapperr?   r   r   r   Zaot_inductorZallow_stack_allocationr   Z_CZ_get_privateuse1_backend_nameZ torch.utils.backend_registrationr   RuntimeError)r   r   r   r   r   r   r   r   r?   Zprivate_backendr   r   r   r   rJ   )r   r   rK   r     s   
	


r   indexSequence[sympy.Expr]
index_varssizesr   c                 C  s$   ddl m} g | t|||S )Nr   )FlexibleLayout)r}   r   r&   Zcontiguous_strides)r   r   r   r   rJ   rJ   rK   index_prevent_reordering  s   r   device_op_overridesc                 C  s   |t | < d S rX   )r   )rf   r   rJ   rJ   rK   register_device_op_overrides  s   r   c                 C  sB   t | tsJ tsddlm}m} ddlm} ddlm} t|  S )Nr=   )cpu_device_op_overridesmps_device_op_overrides)r   )	r   rB   r    r   r   r   r   r   )rf   r   r   r   Zxpu_op_overridesrJ   rJ   rK   get_device_op_overrides  s   r   c                 C  s   i | ]}||qS rJ   rJ   ).0rj   rJ   rJ   rK   
<dictcomp>  s    r   zdict[torch.dtype, torch.dtype]DTYPE_TO_COMPUTATION_DTYPEop_nameargsr   kwargsOptional[torch.dtype]c                 O  s   | t  v rtjS | dv rd|v r|d S |d S | dv rtjS | dv r&tjS | dkr6d|v r2|d S |d S | dkrFd|v rB|d S |d S | d	v rT|d }tj|S | d
krdd|v r`|d S |d S dS )zK
    Given op name and a list of input dtypes, deduce the output dtype
    )to_dtype
index_exprrj   )ZrandZrandn)Z	get_indexZ	randint64	load_seed	reductionr=   constant)loadstorestore_reductionZto_dtype_bitcastN)r    r   rW   floatint64r0   rn   r{   )r   r  r  Zbuf_namerJ   rJ   rK   deduce_output_dtype_by_name  s$   
r  c                   @  sd   e Zd ZdddZdd
dZdddZdddZd ddZd!ddZe	d"ddZ
e	d#ddZdS )$DataTypePropagationbodyr9   rC   rD   c                 C  s8   || _ d|jji| _|j D ]
\}}|j| j|< qd S Nroot)r  Z
root_blockrn   graphsZ	subblocksitems)rx   r  kvrJ   rJ   rK   __init__-  s   zDataTypePropagation.__init__nodetorch.fx.Noder  c                 C  sV   |j }dd |D }t|dkrd S tdd |D }|sd S ttjdd |D S )Nc                 S  s(   g | ]}t |tjjr|jd kr|qS )placeholder)r   r   ZfxNodeopr   nrJ   rJ   rK   
<listcomp>7  s    zCDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<listcomp>r   c                 s  s.    | ]}t j|jv o|jt j jd uV  qd S rX   )OptimizationContextkeymetarj   r  rJ   rJ   rK   	<genexpr>=  s    
zBDataTypePropagation.deduce_node_dtype_by_inputs.<locals>.<genexpr>c                 S  s   g | ]	}|j tj jqS rJ   )r$  r"  r#  rj   r  rJ   rJ   rK   r!  G      )Zall_input_nodeslenall	functoolsreducer   Zpromote_types)rx   r  inputsZinput_nodesZall_input_nodes_propagatedrJ   rJ   rK   deduce_node_dtype_by_inputs5  s   z/DataTypePropagation.deduce_node_dtype_by_inputsri   c                 C  s"   | j |j }| |}|sJ |S rX   )r  targetpropagate_graph)rx   r  Z	sub_graphrj   rJ   rJ   rK   deduce_node_dtype_by_subgraphJ  s   
z1DataTypePropagation.deduce_node_dtype_by_subgraphc                 C  s   |j dkrd S |jdkrt|jdkrd S |jtjkr#| |jd S t|jts+J |j	dr6| 
|S t|jg|jR i |j }d urJ|S | |S )Nr  outputr=   r   Zmasked_subblock)r  r-  r'  r  operatorgetitemdeduce_node_dtyper   rB   
startswithr/  r  r  r,  )rx   r  output_dtyperJ   rJ   rK   r3  P  s(   


z%DataTypePropagation.deduce_node_dtypern   torch.fx.Graphc                 C  sf   |j sJ d }|j D ]&}tj|jv r|jtj }nt }| ||_||jtj< |jdkr0|j}q
|S )Nr0  )nodesr"  r#  r$  r3  rj   r-  )rx   rn   Zgraph_dtyper  opt_ctxrJ   rJ   rK   r.  k  s   


z#DataTypePropagation.propagate_graphc                 C  s   |  | jd S r  )r.  r  rw   rJ   rJ   rK   	propagate}     zDataTypePropagation.propagatec                 C  s   | |  S rX   )r9  )clsr  rJ   rJ   rK   propagate_loopbody     z&DataTypePropagation.propagate_loopbodyr<   c                 C  sB   ddl m} ddlm} t||sJ t|j|sJ t|jS )Nr   r8   )r<   )	loop_bodyr9   	schedulerr<   r   _bodyr  r<  )r;  r  r9   r<   rJ   rJ   rK   propagate_scheduler_node  s
   z,DataTypePropagation.propagate_scheduler_nodeN)r  r9   rC   rD   )r  r  rC   r  )r  r  rC   ri   )rn   r6  rC   r  )rC   r  )r  r9   rC   r  )r  r<   rC   r  )r[   r\   r]   r  r,  r/  r3  r.  r9  classmethodr<  rA  rJ   rJ   rJ   rK   r  ,  s    





r  c                      s&   e Zd Zdddd fd
dZ  ZS )r   T)simplifypr   rb   rC  rW   rD  rC   rB   c                  s6   |rt |tjrttjdrtjj|}t 	|S )Nsizevars)
r   rs   Exprhasattrr0   rn   rE  rC  superdoprint)rx   r   rC  rD  	__class__rJ   rK   rI    s   zPythonPrinter.doprint)r   rb   rC  rW   rD  rW   rC   rB   )r[   r\   r]   rI  __classcell__rJ   rJ   rJ  rK   r     s    r   c                   @  s  e Zd ZdZed1ddZed2dd	Zed2d
dZed2ddZed2ddZ	ed2ddZ
ed2ddZed2ddZed2ddZed2ddZed2ddZed2ddZed3d d!Zed4d%d&Zed4d'd(Zed4d)d*Zed5d,d-Zed4d.d/Zd0S )6OpDecompositionsz!
    Decomposes inductor ops
    valueOpVarTrC   c                 C  s   | S rX   rJ   )rN  rJ   rJ   rK   identity     zOpDecompositions.identityxc                 C  s   t t dtj| S Nr=   )r+   truedivr	  r   int32rR  rJ   rJ   rK   
reciprocal     zOpDecompositions.reciprocalc                 C  s   t | | S rX   )r+   mulrV  rJ   rJ   rK   square  r=  zOpDecompositions.squarec                 C  s   t t dtjt | S rS  )r+   subr	  r   float32erfrV  rJ   rJ   rK   erfc     zOpDecompositions.erfcc                 C  s   t t t | t | S rX   )r+   rY  exprZ  r^  rV  rJ   rJ   rK   erfcx  s   zOpDecompositions.erfcxc                 C  s   t t | t dtjS rS  )r+   r[  r`  r	  r   r\  rV  rJ   rJ   rK   expm1  r_  zOpDecompositions.expm1c              	   C  &   t t | t dtd tjS )Nr=   
   r+   rY  logr	  mathr   r\  rV  rJ   rJ   rK   log10     &zOpDecompositions.log10c              	   C  rc  )Nr=   r   re  rV  rJ   rJ   rK   log2  ri  zOpDecompositions.log2c              
   C  s"   t t | t tdtjS )Nr   )r+   r`  rY  r	  rg  rf  r   r\  rV  rJ   rJ   rK   exp2  s   "zOpDecompositions.exp2c              	   C  s   t t | t dtjS rS  )r+   rf  addr	  r   rU  rV  rJ   rJ   rK   log1p  r_  zOpDecompositions.log1pc                 C  .   t dtj}t |t |t t | S rS  )r+   r	  r   rU  rT  rl  r`  negrR  onerJ   rJ   rK   sigmoid      zOpDecompositions.sigmoidc                 C  s   t | t dtjS Nr   )r+   rt   r	  r   rU  rV  rJ   rJ   rK   relu  rX  zOpDecompositions.reluyzc                 C  s   t t | ||S rX   )r+   rl  rY  rR  rv  rw  rJ   rJ   rK   fma  s   zOpDecompositions.fmarN   rj   ri   c                 C     t t | |S rX   )r+   r  floorrN   rj   rJ   rJ   rK   floor_to_int     zOpDecompositions.floor_to_intc                 C  rz  rX   )r+   r  ceilr|  rJ   rJ   rK   ceil_to_int  r~  zOpDecompositions.ceil_to_intc                 C  rz  rX   )r+   r  truncr|  rJ   rJ   rK   trunc_to_int  r~  zOpDecompositions.trunc_to_intrO   c              	   C  sT   t | |}t t |t dtjt t |t |}t |t 	|||S rt  )
r+   modand_ner	  r   rU  Zsignbitwhererl  )rN   rO   rcondrJ   rJ   rK   	remainder  s   zOpDecompositions.remainderc                 C  rz  rX   )r+   r  roundr|  rJ   rJ   rK   round_to_int  r~  zOpDecompositions.round_to_intN)rN  rO  rC   rO  rR  rO  rC   rO  )rR  rO  rv  rO  rw  rO  rC   rO  )rN   rO  rj   ri   rC   rO  rN   rO  rO   rO  rC   rO  )r[   r\   r]   r   r_   rP  rW  rZ  r^  ra  rb  rh  rj  rk  rm  rr  ru  ry  r}  r  r  r  r  rJ   rJ   rJ   rK   rM    sL    rM  z[a-z0-9_.]+|\([^)]*\)|)flagsstringc                 C  s   | d dkst | dk rdS d}t| dd  D ]$\}}|dkr%|d7 }n|dkr-|d8 }|dkr<|t | d kr< dS q|dksCJ dS )Nr   (r   Fr=   rQ   T)r'  	enumerate)r  rc   icharrJ   rJ   rK   _all_in_parens  s   
r  c                   @  s  e Zd ZedddZedd
dZedddZedddZedddZedddZ	edddZ
edddZedddZedddZedddZedd!d"Zedd#d$Zedd%d&Zedd'd(Zedd)d*Zedd,d-Zedd1d2Z	3	3ddd;d<ZddBdCZddEdFZ	GdddJdKZddLdMZddRdSZddZd[Zdd^d_Z	G	GdddidjZddkdlZdGe j!d3dmdnddvdwZ"ddydzZ#dd{d|Z$edd~dZ%e&dddZ'e&dddZ(dGS )OpOverridesr  rO  rC   c                 C  s,   t | tst| st| r| S d|  dS Nr  rQ   )r   CSEVariable_RE_PAREN_NOT_NEEDED	fullmatchr  )r  rJ   rJ   rK   paren  s   zOpOverrides.parenrN  Union[bool, float, int]rj   ri   c                 C  s   t | S rX   )repr)rN  rj   rJ   rJ   rK   r	    r   zOpOverrides.constantrR  c                 C  rn  rS  )r+   r	  r   rU  rT  rl  libdevice_expro  rp  rJ   rJ   rK   libdevice_sigmoid  rs  zOpOverrides.libdevice_sigmoidc                 C  
   t | S rX   )r+   absrV  rJ   rJ   rK   libdevice_abs     
zOpOverrides.libdevice_absc                 C  r  rX   )r+   sqrtrV  rJ   rJ   rK   libdevice_sqrt  r  zOpOverrides.libdevice_sqrtc                 C  r  rX   )r+   cosrV  rJ   rJ   rK   libdevice_cos  r  zOpOverrides.libdevice_cosc                 C  r  rX   )r+   sinrV  rJ   rJ   rK   libdevice_sin!  r  zOpOverrides.libdevice_sinc                 C  r  rX   )r+   rf  rV  rJ   rJ   rK   libdevice_log%  r  zOpOverrides.libdevice_logc                 C  r  rX   )r+   r`  rV  rJ   rJ   rK   r  )  r  zOpOverrides.libdevice_expc                 C  s   dt |  S )N~r  r  rV  rJ   rJ   rK   bitwise_not-     zOpOverrides.bitwise_notrN   c                 C  s   t |  dS )Nz == 0r  )rN   rJ   rJ   rK   logical_not1  r  zOpOverrides.logical_notrv  c                 C     t |  dt | S )Nz & r  rR  rv  rJ   rJ   rK   bitwise_and5     zOpOverrides.bitwise_andc                 C  r  )Nz | r  r  rJ   rJ   rK   
bitwise_or9  r  zOpOverrides.bitwise_orc                 C  r  )Nz ^ r  r  rJ   rJ   rK   bitwise_xor=  r  zOpOverrides.bitwise_xorc                 C  r  )Nz << r  r  rJ   rJ   rK   bitwise_left_shiftA  r  zOpOverrides.bitwise_left_shiftc                 C  r  )Nz >> r  r  rJ   rJ   rK   bitwise_right_shiftE  r  zOpOverrides.bitwise_right_shiftrO   c                 C  s   t | |S rX   )r+   rT  rT   rJ   rJ   rK   int_truedivI  s   zOpOverrides.int_truedivr   rB   r   c                 C  s   t | t|S rX   )r+   r
  rs   Integer)r   r   rJ   rJ   rK   r  P  r~  zOpOverrides.load_seedTvarr|   Union[sympy.Expr, int]checkrW   wrap_negsympy.Symbolc                 C  s   t t|S rX   )r'   rB   )rx   r  r|   r  r  rJ   rJ   rK   indirect_indexingT  s   zOpOverrides.indirect_indexingr   rb   lowerupperrD   c                 C     t t| j d)Nz,: check_bounds should be handled by CSEProxyrS   r   r[   rx   r   r|   r  r  rJ   rJ   rK   check_bounds]     zOpOverrides.check_boundsr   c                 C  r  )Nz$: load should be handled by CSEProxyr  rx   r   r   rJ   rJ   rK   r
  d     zOpOverrides.loadNmoder/   c                 C  r  )Nz%: store should be handled by CSEProxyr  rx   r   r   rN  r  rJ   rJ   rK   r  i  r  zOpOverrides.storec                 C  r  )Nz/: store_reduction should be handled by CSEProxyr  rx   r   r   rN  rJ   rJ   rK   r  p  r  zOpOverrides.store_reduction	src_dtypereduction_typer.   !Union[OpVarT, tuple[OpVarT, ...]]c                 C  r  )Nz): reduction should be handled by CSEProxyr  rx   rj   r  r  rN  rJ   rJ   rK   r  u     zOpOverrides.reductiondtypestuple[torch.dtype, ...]
combine_fnFCallable[[tuple[OpVarT, ...], tuple[OpVarT, ...]], tuple[OpVarT, ...]]valuestuple[OpVarT, ...]c                 C  r  )Nz$: scan should be handled by CSEProxyr  rx   r  r  r  rJ   rJ   rK   scan     	zOpOverrides.scanstable
descendingc                 C  r  )Nz$: sort should be handled by CSEProxyr  rx   r  r  r  r  rJ   rJ   rK   sort  r  zOpOverrides.sort
boundaries.tuple[str, sympy.Expr, sympy.Expr, sympy.Expr]boundary_indicesindexing_dtyperightsorter Optional[tuple[str, sympy.Expr]]sorter_indicesOptional[OpVarT]c                 C  r  )Nz): bucketize should be handled by CSEProxyr  rx   r  r  r  r  r  r  r  rJ   rJ   rK   	bucketize  s   
zOpOverrides.bucketizec                 C  r  )Nz2: halide_clamp only implemented for Halide backendr  )rx   rN  r|   r  rJ   rJ   rK   halide_clamp  r  zOpOverrides.halide_clampr=   )constraintsrj   is_purepackr+  asmr  r   r  r  r   c                G  r  )Nz<: inline_asm_elementwise only implemented for Triton backendr  )rx   r  r  rj   r  r  r+  rJ   rJ   rK   inline_asm_elementwise  r  z"OpOverrides.inline_asm_elementwiser  c                 G  r  )Nz.: ops.output should not appear at codegen timeAssertionErrorr   r[   )rx   r  rJ   rJ   rK   r0    r  zOpOverrides.outputc                 C  r  )Nz3: ops.placeholder should not appear at codegen timer  rx   r   rJ   rJ   rK   r    r  zOpOverrides.placeholderCallable[..., OpVarT]c                   s   d fdd	} |_ d
|_|S )Nrx   r  r  r   r  rC   rO  c                   s   t t| j d  )Nz does not implement ops.r  rx   r  r  r   rJ   rK   unimplemented  s   z1OpOverrides._unimplemented.<locals>.unimplementedT)rx   r  r  r   r  r   rC   rO  )r[   is_unimplemented)r   r  rJ   r  rK   _unimplemented  s   zOpOverrides._unimplementedc                 C  s2   t | |d }t t|d }| p||kpt |ddS )Nr  F)getattrr,   )r;  r   fnZ
default_fnrJ   rJ   rK   _is_unimplemented  s   zOpOverrides._is_unimplementedr-  c                 C  s   |dv sJ |t  D ]7\}}t||}|d u r(| |r't| || | q|| jvs8J d| d| j ||_t| |t| qd S )N)r   r   cppvecr   r   zmultiple definitions of z on )	pointwise_overrides_datar  r  r  setattrr  __dict__r[   r_   )r;  r-  funcnamedataimplrJ   rJ   rK   _initialize_pointwise_overrides  s   

z+OpOverrides._initialize_pointwise_overrides)r  rO  rC   rO  )rN  r  rj   ri   rC   rO  r  )rN   rO  rC   rO  )rR  rO  rv  rO  rC   rO  r  )r   rB   r   rO  rC   rO  TT)
r  rO  r|   r  r  rW   r  rW   rC   r  
r   rb   r|   rb   r  rW   r  rW   rC   rD   )r   rB   r   rb   rC   rO  rX   )
r   rB   r   rb   rN  rO  r  r/   rC   rD   )r   rB   r   rb   rN  rO  rC   rD   )
rj   ri   r  ri   r  r.   rN  r  rC   r  )r  r  r  r  r  r  rC   r  )
r  r  r  r  r  rW   r  rW   rC   r  NN)r  rO  r  r  r  rO  r  ri   r  rW   r  r  r  r  rC   rO  )rN  rO  r|   rb   r  rW   rC   rO  )r+  rO  r  rB   r  r   rj   ri   r  rW   r  r   rC   rO  )r  rO  rC   rD   )r   r   rC   rO  )r   rB   rC   r  r   rB   rC   rW   )r-  rB   rC   rD   ))r[   r\   r]   r_   r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r  r   r\  r  r0  r  r  rB  r  r  rJ   rJ   rJ   rK   r     s    

	





	


r  c                   @  s\   e Zd ZU ded< ded< dZded< dZded< ejZd	ed
< dZ	ded< dZ
ded< dS )OverridesDatarB   r   zCallable[..., str]r   NzOptional[Callable[..., str]]r   r  r   type_promotion_kindr   r   )r[   r\   r]   r   r   r  r   DEFAULTr  r   r   rJ   rJ   rJ   rK   r    s   
 
r  Zairy_aic                 C     d|  dS )Nzairy_ai_forward(rQ   rJ   rV  rJ   rJ   rK   r         r   Zspecial_airy_ai)r  r   r   Z	bessel_j0c                 C  r	  )Nzbessel_j0_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  c                 C  r	  )Nzlibdevice.j0(rQ   rJ   rV  rJ   rJ   rK   r     r
  Zspecial_bessel_j0)r  r   r   r   Z	bessel_j1c                 C  r	  )Nzbessel_j1_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  c                 C  r	  )Nzlibdevice.j1(rQ   rJ   rV  rJ   rJ   rK   r     r
  Zspecial_bessel_j1Z	bessel_y0c                 C  r	  )Nzbessel_y0_forward(rQ   rJ   rV  rJ   rJ   rK   r   	  r
  c                 C  r	  )Nzlibdevice.y0(rQ   rJ   rV  rJ   rJ   rK   r   
  r
  Zspecial_bessel_y0Z	bessel_y1c                 C  r	  )Nzbessel_y1_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  c                 C  r	  )Nzlibdevice.y1(rQ   rJ   rV  rJ   rJ   rK   r     r
  Zspecial_bessel_y1Zdigammac                 C  r	  )Nzcalc_digamma(rQ   rJ   rV  rJ   rJ   rK   r     r
  c                 C  
   |  dS )Nz
.digamma()rJ   rV  rJ   rJ   rK   r        
 )r  r   r  r   ra  c                 C  r	  )Nzcalc_erfcx(rQ   rJ   rV  rJ   rJ   rK   r     r
  c                 C  r	  )Nzlibdevice.erfcx(rQ   rJ   rV  rJ   rJ   rK   r     r
  Zspecial_erfcxry  c                 C     d|  d| d| dS )Nz	std::fma(rP   rQ   rJ   rx  rJ   rJ   rK   r   #      c                 C  r  )Nzfmadd(rP   rQ   rJ   rx  rJ   rJ   rK   r   $  r  c                 C  r  )Nzlibdevice.fma(rP   rQ   rJ   rx  rJ   rJ   rK   r   %  r  )r  r   r  r   r   Zigammac                 C     d|  d| dS Nzcalc_igamma(rP   rQ   rJ   r  rJ   rJ   rK   r   +      Zigammacc                 C  r  Nzcalc_igammac(rP   rQ   rJ   r  rJ   rJ   rK   r   0  r  Zgammaincc                 C  r  r  rJ   r  rJ   rJ   rK   r   5  r  Zspecial_gammaincZ	gammainccc                 C  r  r  rJ   r  rJ   rJ   rK   r   :  r  Zspecial_gammainccZi0c                 C  r	  )Nzcalc_i0(rQ   rJ   rV  rJ   rJ   rK   r   ?  r
  c                 C  r	  Nzlibdevice.cyl_bessel_i0(rQ   rJ   rV  rJ   rJ   rK   r   @  r
  c                 C  r  )Nz.i0()rJ   rV  rJ   rJ   rK   r   A  r  )r  r   r   r  r   Zi0ec                 C  r	  )Nz	calc_i0e(rQ   rJ   rV  rJ   rJ   rK   r   F  r
  c                 C  r  )Nz.i0e()rJ   rV  rJ   rJ   rK   r   G  r  Zspecial_i0ei1c                 C  r	  )Nzcalc_i1(rQ   rJ   rV  rJ   rJ   rK   r   L  r
  c                 C  r	  Nzlibdevice.cyl_bessel_i1(rQ   rJ   rV  rJ   rJ   rK   r   M  r
  Z
special_i1Zi1ec                 C  r	  )Nz	calc_i1e(rQ   rJ   rV  rJ   rJ   rK   r   R  r
  Zspecial_i1eZlog_ndtrc                 C  r	  )Nzcalc_log_ndtr(rQ   rJ   rV  rJ   rJ   rK   r   W  r
  Zspecial_log_ndtrZmodified_bessel_i0c                 C  r	  )Nzmodified_bessel_i0_forward(rQ   rJ   rV  rJ   rJ   rK   r   ]  r
  c                 C  r	  r  rJ   rV  rJ   rJ   rK   r   ^  r
  Zspecial_modified_bessel_i0Zmodified_bessel_i1c                 C  r	  )Nzmodified_bessel_i1_forward(rQ   rJ   rV  rJ   rJ   rK   r   c  r
  c                 C  r	  r  rJ   rV  rJ   rJ   rK   r   d  r
  Zspecial_modified_bessel_i1Zmodified_bessel_k0c                 C  r	  )Nzmodified_bessel_k0_forward(rQ   rJ   rV  rJ   rJ   rK   r   i  r
  Zspecial_modified_bessel_k0Zmodified_bessel_k1c                 C  r	  )Nzmodified_bessel_k1_forward(rQ   rJ   rV  rJ   rJ   rK   r   n  r
  Zspecial_modified_bessel_k1Zndtrc                 C  r	  )Nz
calc_ndtr(rQ   rJ   rV  rJ   rJ   rK   r   t  r
  Zspecial_ndtrZndtric                 C  r	  )Nzcalc_ndtri(rQ   rJ   rV  rJ   rJ   rK   r   y  r
  Zspecial_ndtriZ	polygammac                 C  s   |  d| d| d|  dS )Nz == 0 ? calc_digamma(z) : calc_polygamma(rP   rQ   rJ   r  rJ   rJ   rK   r   ~  s    Zscaled_modified_bessel_k0c                 C  r	  )Nz"scaled_modified_bessel_k0_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  Z!special_scaled_modified_bessel_k0Zscaled_modified_bessel_k1c                 C  r	  )Nz"scaled_modified_bessel_k1_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  Z!special_scaled_modified_bessel_k1Zspherical_bessel_j0c                 C  r	  )Nzspherical_bessel_j0_forward(rQ   rJ   rV  rJ   rJ   rK   r     r
  Zspecial_spherical_bessel_j0zetac                 C  r  )Nzzeta(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_zetaZchebyshev_polynomial_tc                 C  r  )Nzchebyshev_polynomial_t_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_chebyshev_polynomial_tZchebyshev_polynomial_uc                 C  r  )Nzchebyshev_polynomial_u_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_chebyshev_polynomial_uZchebyshev_polynomial_vc                 C  r  )Nzchebyshev_polynomial_v_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_chebyshev_polynomial_vZchebyshev_polynomial_wc                 C  r  )Nzchebyshev_polynomial_w_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_chebyshev_polynomial_wZlegendre_polynomial_pc                 C  r  )Nzlegendre_polynomial_p_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_legendre_polynomial_pZshifted_chebyshev_polynomial_tc                 C  r  )Nz'shifted_chebyshev_polynomial_t_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Z&special_shifted_chebyshev_polynomial_tZshifted_chebyshev_polynomial_uc                 C  r  )Nz'shifted_chebyshev_polynomial_u_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Z&special_shifted_chebyshev_polynomial_uZshifted_chebyshev_polynomial_vc                 C  r  )Nz'shifted_chebyshev_polynomial_v_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Z&special_shifted_chebyshev_polynomial_vZshifted_chebyshev_polynomial_wc                 C  r  )Nz'shifted_chebyshev_polynomial_w_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Z&special_shifted_chebyshev_polynomial_wZhermite_polynomial_hc                 C  r  )Nzhermite_polynomial_h_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_hermite_polynomial_hZhermite_polynomial_hec                 C  r  )Nzhermite_polynomial_he_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_hermite_polynomial_heZlaguerre_polynomial_lc                 C  r  )Nzlaguerre_polynomial_l_forward(rP   rQ   rJ   r  rJ   rJ   rK   r     r  Zspecial_laguerre_polynomial_lzdict[str, OverridesData]r  r   c                   s.   t  fddtjjtjjtjjtjjfD S )Nc                 3  s    | ]} |v V  qd S rX   rJ   r   rR  r  rJ   rK   r%    s
    
z$is_buffer_removed.<locals>.<genexpr>)anyr0   rn   removed_bufferskernelinplaced_to_remover  rJ   r  rK   is_buffer_removed  s   r  c                      s6   e Zd ZdZd fddZdd	d
ZdddZ  ZS )DeferredLinezHA line that can be 'unwritten' by adding name to V.graph.removed_buffersr   rB   linec                   s$   t  | || _t|trJ d S rX   )rH  r  r   r   r!   )rx   r   r  rJ  rJ   rK   r    s   zDeferredLine.__init__rC   r   c                 C  s   t | js| jS d S rX   )r  r   r  rw   rJ   rJ   rK   __call__  s   
zDeferredLine.__call__c                 C  s   t | j|S rX   )r  r   )rx   r  rJ   rJ   rK   	_new_line  s   zDeferredLine._new_line)r   rB   r  rB   r   )r  rB   rC   r  )r[   r\   r]   r   r  r  r   rL  rJ   rJ   rJ  rK   r    s
    
r  c                   @  s   e Zd Zd	d
ddZdS )BracesBufferr=   r   r   rC   'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )NrC   Iterator[None]c                  3  s    t  D ]} d  jd7  _qt   D ]}  jd8  _d qd V  t   D ]} d  jd7  _q0t  D ]}  jd8  _d qCd S )N{r=   })range	writeline_indent)_r   rx   rJ   rK   ctx  s   

z BracesBuffer.indent.<locals>.ctx)rC   r#  )
contextlibcontextmanager)rx   r   r+  rJ   r*  rK   indent  s   zBracesBuffer.indentN)r=   )r   r   rC   r"  )r[   r\   r]   r.  rJ   rJ   rJ   rK   r!    s    r!  c                   @  s   e Zd ZU ded< ded< dS )InplacedBufferrB   rh   r   other_namesNr   rJ   rJ   rJ   rK   r/  	  s   
 r/  c                   @  s,   e Zd ZU ded< dZded< d
ddZd	S )ArgNamerB   r   FrW   is_constexprrC   c                 C  s   | j  | jr
d S d S )Nz : tl.constexprr   )r   r2  rw   rJ   rJ   rK   	full_name  s   zArgName.full_nameNr   )r[   r\   r]   r   r2  r3  rJ   rJ   rJ   rK   r1    s   
 r1  c                   @  s   e Zd ZdddZdS )
RemovedArgrC   rB   c                 C  s   dS )NREMOVEDrJ   rw   rJ   rJ   rK   __str__  r   zRemovedArg.__str__Nr   )r[   r\   r]   r6  rJ   rJ   rJ   rK   r4    s    r4  c                   @  s   e Zd ZedIdd	ZdJddZdKddZedLddZdMddZdMddZ	dNddZ
dOdd ZdPd"d#ZdQd&d'ZdRd)d*ZdSd,d-ZdTd/d0ZdUd4d5ZdVd8d9ZdWd;d<ZdXd>d?ZdYdAdBZdZdCdDZd[dFdGZdHS )\
KernelArgsrl   rB   odict6Union[dict[_T, Union[str, RemovedArg]], dict[_T, str]]r   r@   rC   c                 C  s6   | |t}t|tr|  t|  ||< }|S |S rX   )getr5  r   r4  r'  )rl   r8  r   resultZ
new_resultrJ   rJ   rK   _lookup!  s
   
zKernelArgs._lookuprD   c                 C  s"   i | _ i | _i | _i | _g | _d S rX   )input_buffersoutput_buffersinplace_buffersrE  workspace_argsrw   rJ   rJ   rK   r  -  s
   
zKernelArgs.__init__c              
   C  s&   d dtt| j| j| j| jgS )NzKernelArgs({})rP   )formatrr   mapr  r=  r>  r?  rE  rw   rJ   rJ   rK   __repr__4  s   zKernelArgs.__repr__r   rW   c                 C  s
   t | tS rX   r   r4  r  rJ   rJ   rK   _buffer_is_marked_removedC  s   
z$KernelArgs._buffer_is_marked_removedc                 C  s   t jjrt jjj||}|t jjvsJ ||| jv r$tt| j| S || j	v r2tt
| j	| jS |dr?| d| j|S | d| j|S )NseedZin_ptr)r0   rn   r?  mutation_real_namer:  r  r>  r   rB   r?  r/  rh   r4  r<  r=  r   rJ   rJ   rK   inputH  s   


zKernelArgs.inputc                 C  sZ   t jjrt jjj||}|t jjvsJ ||| jv r%tt| j| j	S | 
d| j|S )NZout_ptr)r0   rn   r?  rG  r:  r  r?  r   r/  rh   r<  r>  r   rJ   rJ   rK   r0  T  s   
zKernelArgs.output
input_nameoutput_namec                 C  s   || j vsJ || j v r%| j | }t|trJ |j| || j |< d S dd | j  D }dd | j  D }tt|t| }td| ||g}|| j |< || j |< d S )Nc                 S  s   g | ]	}t |ts|qS rJ   rD  r   valrJ   rJ   rK   r!  d      z+KernelArgs.make_inplace.<locals>.<listcomp>c                 S  s   g | ]	}t |tr|qS rJ   rD  rK  rJ   rJ   rK   r!  i  rM  Z
in_out_ptr)	r?  r   r4  r0  appendr  r'  r*   r/  )rx   rI  rJ  bufZalive_buffersr  Zinplace_buffer_idxrJ   rJ   rK   make_inplace\  s&   


zKernelArgs.make_inplacenbytesrb   rV   tuple[str, int]c                 C  s   t |t|tj t  d}t| jD ]+\}}t 	||r2|j
}t ||| j|< |j|f  S |j|jkr>|j|jks@J q| j| |jdfS )a  
        Allocate or extend a workspace buffer of nbytes bytes.

        This function manages the allocation of a workspace buffer. It either creates
        a new WorkspaceArg or extends an existing one.

        Note:
        - Calling this function will in-place mutate the args by adding or updating
        a WorkspaceArg.
        - The codegen for generating the Python argdefs and call_defs will check
        this field and allocate the buffer accordingly.
        - A new argument "ws_ptr" will be present in the generated code.

        Args:
            nbytes (sympy.Expr): The number of bytes to allocate.
            zero_fill (bool): Whether to initialize the buffer to zero.

        Returns:
            Tuple[str, int]: A tuple containing:
                - "ws_ptr": A string identifier for the workspace pointer.
                - offset: An integer representing the byte offset in the workspace.
        )rc   rd   rf   rg   r   )ra   rM   rZ   r0   rn   get_current_device_or_throwro   r  r@  rp   rc   rr   rh   rg   rN  )rx   rQ  rV   argr  existing_argr   rJ   rJ   rK   	workspacev  s   
zKernelArgs.workspacemin_sizec              	   C  sh   t j }t|tjtjdd|j d|j	 |d}| j
D ]}|j|jkr*||ks*J q| j
| |jS )a  
        Lazily allocate a graph-wide semaphores buffer with at least min_size.  This is a single buffer shared by
        all kernels and zero initialized once at graph start.  Each kernel must leave the buffer zeroed on exit.

        Warning: multiple calls to this function will return the same buffer.

        Args:
            min_size: the number of int32 semaphores required

        Returns:
            name of the semaphores buffer
        Zsem_ptrZsemaphores_r)  )rc   rd   rj   rh   rg   rf   )r0   rn   rS  ra   rM   r^   r   uint32r   r   r@  rh   rN  )rx   rW  current_devicerT  rU  rJ   rJ   rK   
semaphores  s   

zKernelArgs.semaphoresrN  r   c                   sx   t |tsJ t||ft|}|| jv r| j| S  | j v r5  t fdd| j D    | j|<  S )Nc                 3  s    | ]
}|  rd V  qdS )r=   N)r4  )r   r  r  rJ   rK   r%    s    z)KernelArgs.seed_offset.<locals>.<genexpr>)r   r   r   rs   r  rE  r  sum)rx   r   rN  rJ   r  rK   seed_offset  s   


"
zKernelArgs.seed_offsetr  c                 C  sD   t |tjsJ t||f|jdkrd| j|< dS | d| j|S )NrF  ks)r   rs   Symbolr   r   rE  r<  r   rJ   rJ   rK   r|     s
   

zKernelArgs.sizeIterator[str]c                 C  s   t | j | j | j S rX   )r   r=  keysr>  rE  rw   rJ   rJ   rK   
call_names  s   zKernelArgs.call_namesr   c                 C  sX   | j |d}|durt|ts|jS | j|d}|dur%t|ts%|S | j|dS )z;
        Returns inner name of a given outer name.
        N)r?  r:  r   r4  rh   r>  r=  )rx   r   inplacedrJ  rJ   rJ   rK   arg_name  s   zKernelArgs.arg_namerO  rj   ri   c                 C  s   |S rX   rJ   )rx   rO  rj   rJ   rJ   rK   wrap_ptr_arg  r   zKernelArgs.wrap_ptr_argr|   
SymbolLikec                 C  s   t |S rX   )rB   )rx   r|   rJ   rJ   rK   wrap_size_arg  r   zKernelArgs.wrap_size_arg&tuple[list[str], list[str], list[str]]c                 C  s  ddl m}m} g }g }g }t| j D ]5}t|trq|jd }|j	}t
j|}	||	 }
||
 d|  || ||	 ||
 d q| j D ]1\}}|| jv rZqPt
j|}	||	 }
|d|
 d|  || ||	 |d|
 d qP| j D ]4\}}|| jv st|trqt
j|}	||	 }
||
 d|  || ||	 ||
 d q| j D ]*\}}|d| d|  || | |d|  t
jjrt
jj| q| jrJ d|||fS )	Nr=   )DTYPE_TO_CPP
INDEX_TYPEr  z* *zconst  zWorkspace not supported on CPU )Z	cpp_utilsrh  ri  r*   r?  r  r   r4  r0  rh   r0   rn   r{   rN  rd  r=  r  r>  rE  rf  wrapper_codeensure_size_computedr@  )rx   rh  ri  	call_argsarg_defs	arg_typesrb  outerinnerrj   Z	cpp_dtypeZmaybe_innerrJ   rJ   rK   cpp_argdefs  sN   



zKernelArgs.cpp_argdefs?tuple[list[ArgName], list[str], list[KernelArgType], list[Any]]c           	   
   C  s  g }g }g }g }t | j D ]9}t|trq|t|j ||jd  |t	j
|jd  |t|j|jd t	j
|jd d qt| j | j D ]2\}}|| jv sbt|trcqT|t| || |t	j
| |t||t	j
|d qT| j D ]*\}}|t| || |t| |t|| t	j
jrt	j
j| q| jD ]}|t|j ||j || ||j q||||fS )Nr  )r   r   rj   )r*   r?  r  r   r4  rN  r1  rh   r0  r0   rn   r{   r   r   r=  r  r>  rE  r   r   rl  rm  r@  rg   rj   )	rx   ro  rn  rp  Zprecompile_argsrb  rq  rr  rT  rJ   rJ   rK   python_argdefs  s\   





zKernelArgs.python_argdefsIterator[tuple[str, str]]c                 c  s    t | j D ]:}t|trq|jD ].}|tjjv s!|tj	jv r"q|| j
v r0| j
| |jfV  || jv rAtt| j| |jfV  qqd S rX   )r*   r?  r  r   r4  r0  r0   rn   r  r  r=  rh   r>  r   rB   )rx   rb  otherrJ   rJ   rK   aliases@  s   



zKernelArgs.aliasesc                 C  s(   t | j|ttot | j|ttS rX   )r   r>  r:  r5  r4  r?  r   rJ   rJ   rK   
is_removedO  s
   zKernelArgs.is_removedOrderedSet[str]c                 C  sn   t  }t| j D ]}t|trq
||jd  q
| j	 D ]\}}|| jv s.t|tr/q || q |S )Nr  )
r   r*   r?  r  r   r4  rl  r0  r>  r  )rx   Z	live_outsrb  rq  rr  rJ   rJ   rK   live_output_buffersW  s   
zKernelArgs.live_output_buffersN)rl   rB   r8  r9  r   r@   rC   rB   rC   rD   r   )r   r   rC   rW   r   )rI  rB   rJ  rB   rC   rD   )rQ  rb   rV   rW   rC   rR  )rW  rb   rC   rB   )r   rB   rN  r   rC   rB   )r   r  rC   rB   )rC   r_  )r   rB   rC   r   )rO  rB   rj   ri   rC   rB   )r|   re  rC   rB   )rC   rg  )rC   rt  )rC   rv  r  )rC   rz  )r[   r\   r]   r_   r<  r  rC  rE  rH  r0  rP  rV  rZ  r\  r|   ra  rc  rd  rf  rs  ru  rx  ry  r{  rJ   rJ   rJ   rK   r7     s.    






)







)
1
r7  c                      sX   e Zd ZdZ	dd fd	d
Zd ddZd!ddZd"ddZd#ddZd ddZ	  Z
S )$r  aD  A CSEVariable is just a name for an expression but it is useful to be able to annotate them on a backend dependent basis.
    To do so, the backends can simply overload `Kernel.create_cse_var`
    The "CSEVariable.update_on_args" method gives you a hook for annotations
    See example of TritonCSEVariable in triton.py
    Nr   rB   boundsValueRanges[Any]rj   r  c                   s4   t    t|tsJ || _|| _d| _|| _d S rS  )rH  r  r   r   r   r}  	use_countrj   )rx   r   r}  rj   rJ  rJ   rK   r  k  s   

zCSEVariable.__init__rC   c                 C  ru   rX   r  rw   rJ   rJ   rK   r6  x  rz   zCSEVariable.__str__r   c                 C  s
   t | jS rX   )hashr   rw   rJ   rJ   rK   __hash__{  r   zCSEVariable.__hash__rw  objectrW   c                 C  s   t |to
|j| jkS rX   )r   r  r   )rx   rw  rJ   rJ   rK   __eq__~  r   zCSEVariable.__eq__r  r   r  rD   c                 C  r   rX   rJ   )rx   r   r  r  rJ   rJ   rK   update_on_args  r   zCSEVariable.update_on_argsc                 C  s   | j j d| jdS r  )rK  r[   r   rw   rJ   rJ   rK   rC    r   zCSEVariable.__repr__rX   )r   rB   r}  r~  rj   r  r   )rC   r   )rw  r  rC   rW   )r   rB   r  r   r  r   rC   rD   )r[   r\   r]   r   r  r6  r  r  r  rC  rL  rJ   rJ   rJ  rK   r  d  s    




r  AugmentedKeyT)defaultCSEVariableType)boundr  .c                   @  s   e Zd ZdZ							dAdBddZdCddZdDddZdDddZdEd d!ZdFd$d%Z	dGd'd(Z
dHd*d+ZdId,d-Ze d.d.dd/dJd:d;Ze dfdKd<d=Ze dfdLd?d@ZdS )MCSEz Common subexpression eliminationr   tmpNrl   rB   suffixname_prefixiter_buffersOptional[itertools.count[int]]store_cache.Optional[MutableMapping[str, CSEVariableType]]reduction_cache<Optional[MutableMapping[ReductionCacheKey, CSEVariableType]]varname_map$Optional[dict[str, CSEVariableType]]c                 C  sP   || _ || _i | _|| _|pi | _|pi | _|pt | _t	 | _
|p$i | _d S rX   )rl   r  _cacher  r  r  	itertoolsrc   iter_buffer_idsr   invalidated_storesr  )rx   rl   r  r  r  r  r  r  rJ   rJ   rK   r    s   
zCSE.__init__	keep_varsOrderedSet[CSEVariable]rC   rD   c                   s`   g | j  D ]\}}| vr| j |= | j| q r+ fdd| j D | _d S i | _d S )Nc                   s   i | ]\}}| v r||qS rJ   rJ   )r   r  r  r  rJ   rK   r     s    z"CSE.invalidate.<locals>.<dictcomp>)r  r  r  rl  r  )rx   r  r   r  rJ   r  rK   
invalidate  s   
zCSE.invalidatetyping.Selfc              	   C  s(   t | | j| j| j| j| j| j| jdS )N)rl   r  r  r  r  r  r  )r   rl   r  r  r  r  r  r  rw   rJ   rJ   rK   clone  s   z	CSE.clonec                 C  s0   |   }t| j|_t| j|_t| j|_|S )zNReturn a copy of using ScopedDict so changes to *_cache aren't visible in self)r  r%   r  r  r  )rx   Znew_cserJ   rJ   rK   scoped_copy  s
   zCSE.scoped_copy	cache_keyr  c                 C  s
   t t|S )z@Override this method to augment cache key with backend specifics)r   r  rx   r  rJ   rJ   rK   augment_key  r  zCSE.augment_keyrL  r  c                 C  s   || j | |< d S rX   r  r  )rx   r  rL  rJ   rJ   rK   put     zCSE.putrW   c                 C  s   |  || jv S rX   )r  r  r  rJ   rJ   rK   contains  r:  zCSE.containsOptional[CSEVariableType]c                 C  s   | j | |d S rX   )r  r:  r  r  rJ   rJ   rK   try_get  r  zCSE.try_getc                 C  s   | j | | S rX   r  r  rJ   rJ   rK   r:    r:  zCSE.getT)r}  write
assignmentrj   r   r#   r   CUnion[str, CSEVariable, OpsValue, IndentedBuffer, DeferredLineBase]r}  r~  r  r  rj   r  c             	   C  s  t |tr|j}|s|sJ t |tr&|j||_| jd7  _tt|S t |t	r0|
 }nt |tr9|j}n	t |ts@J |}| |}|s| ||}| || |rtjjrdtjjj|dd t |t	r|rv|| j | d || || j |S t |tr|sJ ||| j | d|j | j  |S |r| j | d| | j }	n| | j }	||	 |rtjjr|d urd| j | dt| d}
||
 |S |j||_| jd7  _|S )	Nr=   T)Z	only_oncez =z = tl.static_assert(
.dtype == rQ   )r   r-   rN  r  r}  Ztightenr  r   r  r#   getvaluer!   r  rB   r  newvarr  r0   r  current_nodeZcodegen_originating_infor'  rl   splicer  r   r   test_configsruntime_triton_dtype_assertr)   )rx   r   r   r}  r  r  rj   r  r  r  Zassert_linerJ   rJ   rK   generate  sb   










 

zCSE.generatec                 C  s2   | j  t| j }tj|||}|| j|< |S rX   )r  rm   r  r0   r  create_cse_varr  )rx   r}  rj   var_namer  rJ   rJ   rK   r    s   
z
CSE.newvarr   c                   s8   t  | jv fdd tj ||}|| j < |S )Nc                     s
   d  S )Nzduplicate name: rJ   rJ   r  rJ   rK   r   0  r  zCSE.namedvar.<locals>.<lambda>)r   _check_valuer  r0   r  r  )rx   r   r}  rj   r  rJ   r  rK   namedvar)  s   
zCSE.namedvar)r   r   r  NNNN)rl   rB   r  rB   r  rB   r  r  r  r  r  r  r  r  )r  r  rC   rD   rC   r  )r  rB   rC   r  )r  rB   rL  r  rC   rD   )r  rB   rC   rW   )r  rB   rC   r  )r  rB   rC   r  )r   r#   r   r  r}  r~  r  rW   r  rW   rj   r  rC   r  )r}  r~  rj   r  rC   r  )r   rB   r}  r~  rj   r  rC   r  )r[   r\   r]   r   r  r  r  r  r  r  r  r  r:  r   unknownr  r  r  rJ   rJ   rJ   rK   r    s:    








Fr  c                      s2   e Zd Zd fddZdddZdddZ  ZS )CodeGenrC   rD   c                   s   t    t | _d S rX   )rH  r  r,  	ExitStack
exit_stackrw   rJ  rJ   rK   r  8  s   
zCodeGen.__init__r  c                 C  s   | j   | S rX   )r  	__enter__rw   rJ   rJ   rK   r  <  s   
zCodeGen.__enter__exc_typer   exc_valexc_tbc                 C  s   | j ||| d S rX   )r  __exit__rx   r  r  r  rJ   rJ   rK   r  @  r  zCodeGen.__exit__r|  r  r  r   r  r   r  r   rC   rD   )r[   r\   r]   r  r  r  rL  rJ   rJ   rJ  rK   r  7  s    
r  c                      sZ  e Zd ZU dZded< dZded< dZded< 	dwdx fddZej	dyddZ
ej			dzd{ddZd|d!d"Zd|d#d$Zd}d&d'Z	d~dd*d+Zdd2d3Zdd:d;Zdd>d?ZddAdBZ		dzddLdMZeddNdOZ	d~ddWdXZdd[d\Zdd]d^Zd fd`daZd fdfdgZddhdiZddjdkZddldmZddodpZddrdsZ ddudvZ!  Z"S )Kernelr   rB   newvar_prefixr  Nz'Optional[Callable[[], OpsHandler[Any]]]	overridesTr  Optional[KernelArgs]increase_kernel_countrW   rC   rD   c                   s   t    |rt jd7  _|pt | _t | _t | _t | _	d| _
d| _t| j| j| _tt  | _tt  | _d | _d | _d | _d | _tt  | _tt  | _i | _d| _d | _d S )Nr=   r   )rH  r  r   Zgenerated_kernel_countr7  r  r#   loadscomputestoresnum_loadnum_reductionr  r  r  cser   rB   must_keep_buffersstore_buffer_namesZ
_load_maskZ_load_otherr  node_to_boundsr  r  Zinplace_update_buffersZmin_elem_per_threadZkernel_name)rx   r  r  rJ  rJ   rK   r  I  s*   

zKernel.__init__r  r<   r#  c                 c  s:    | j }|| _ |j  | _z	d V  W || _ d S || _ w rX   )r  r@  r}  Z
get_boundsr  )rx   r  priorrJ   rJ   rK   set_current_nodek  s   zKernel.set_current_nodelbr#   cbOptional[IndentedBuffer]sbc           	      c  s    |d u r|}|d u  }rt  }| j}| j}| j}| j}|| _|| _|| _| | _zd V  W || _|| _|| _|| _|rC|rEJ dd S d S || _|| _|| _|| _|r[|r[J dw )Nz$unexpected store inside swap_buffers)r#   r  r  r  r  r  )	rx   r  r  r  Zdisallow_storesr  r  r  r  rJ   rJ   rK   swap_buffersu  s:   
zKernel.swap_buffersr   r   rb   r  c                 C  r   rX   r   r  rJ   rJ   rK   r
    r   zKernel.loadc                 C  s,   | j }z| j| _ | ||W || _ S || _ w )z+A load the depends on an index we have read)r  r  r
  )rx   r   r   r  rJ   rJ   rK   indirect_load  s
   zKernel.indirect_loadrN  c                 C  r   rX   r   r  rJ   rJ   rK   r    r   zKernel.store_reductionr  r/   c                 C  r   rX   r   r  rJ   rJ   rK   r    rQ  zKernel.storerj   ri   r  r  r.   +Union[CSEVariable, tuple[CSEVariable, ...]]c                 C  r   rX   r   r  rJ   rJ   rK   r       zKernel.reductionr  r  r  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]r  tuple[CSEVariable, ...]c                 C  r   rX   r   r  rJ   rJ   rK   r    s   zKernel.scanr  r  c                 C  r   rX   r   r  rJ   rJ   rK   r    r  zKernel.sortdict[sympy.Symbol, sympy.Expr]c                 C  r   rX   r   rw   rJ   rJ   rK   
var_ranges  r   zKernel.var_rangesr  r  r  r  r  r  r  r  Optional[CSEVariable]c                 C  r   )z3
        See [Note: Inductor bucketize op]
        r   r  rJ   rJ   rK   r    s   zKernel.bucketizec                 C  r   rX   r   rw   rJ   rJ   rK   assert_function  r   zKernel.assert_functionr  Union[CSEVariable, str]r  r   r  mask!Optional[Union[CSEVariable, str]]c              	   C  s   t |tr	t|}t |tsJ |d u st |tsJ |d u s&t |ts&J |rD|rDd| d| d| d| d	}| d| d| }n|rP| d| }|}n|sTJ | d| }|}|rhd| d| d}| j d| d| dS )	Nr  z <= z) & (z < rQ   z) | ~(z, "index out of bounds: z"))r   r  rB   r  )rx   r  r  r  r  r  Z
cond_printrJ   rJ   rK   indirect_assert  s"   
zKernel.indirect_assertr   r|   c                 C  r   rX   r   r  rJ   rJ   rK   r    rQ  zKernel.check_boundsc                 C  r   rX   r   r  rJ   rJ   rK   index_to_str  r   zKernel.index_to_strr  c                   sF   t    | js
J | jtt| |   | jt|  | S rX   )	rH  r  r  r  enter_contextr0   Zset_ops_handlerCSEProxyZset_kernel_handlerrw   rJ  rJ   rK   r    s   

zKernel.__enter__r  r   r  r  c                   s   |    t ||| d S rX   )remove_kernel_local_buffersrH  r  r  rJ  rJ   rK   r    s   zKernel.__exit__c                   s   t jjsdS tfdd| jD }tt   | jD ]}|| jvr4|| jjvr4	||r4 
| q D ]3}|| jjv re| jj| }t|trKq7t fdd|jD }|r^| | | j
| q7| | q7dS )z
        Any buffers that are both created and have a last use in the
        same kernel can be removed.

        Note that V.graph.scheduler can be None when codegening triton template
        kernels.
        Nc                 3  s(    | ]}| j v r j |  V  qd S rX   )Zname_to_bufZdefining_op_namer   rO  )r?  rJ   rK   r%    s    
z5Kernel.remove_kernel_local_buffers.<locals>.<genexpr>c                 3  s    | ]}| v V  qd S rX   rJ   r  )names_to_removerJ   rK   r%  ,  s    )r0   rn   r?  r   r  rB   r  r  r=  Z$can_buffer_be_removed_through_fusionrl  r?  r   r4  r(  r0  remove_inplace_bufferr  remove_buffer)rx   Zfused_node_namesr   rO  removerJ   )r  r?  rK   r    s6   





z"Kernel.remove_kernel_local_buffersc                 C  (   t d| t| jj|< | j| d S )Nzremove_buffer(%r))rf  rI   r5  r  r>  r  rl  r   rJ   rJ   rK   r  3  s   zKernel.remove_bufferc                 C  r  )Nzremoving_inplace_buffer(%r))rf  rI   r5  r  r?  r  rl  r   rJ   rJ   rK   r  ;  s   zKernel.remove_inplace_buffer;Union[list[sympy.Expr], tuple[sympy.Expr, ...], sympy.Expr]c                   s\   t |ttfr fdd|D S tjj|}t|jdd d} fdd|D }t	||S )Nc                   s   g | ]}  |qS rJ   )rename_indexingr  rw   rJ   rK   r!  F  s    z*Kernel.rename_indexing.<locals>.<listcomp>c                 S  ru   rX   r  )srJ   rJ   rK   r   H  s    z(Kernel.rename_indexing.<locals>.<lambda>)r#  c                   s0   i | ]}t |tjtjtjfr| j|qS rJ   )r   r   ZUNBACKED_INTZSIZEZPRECOMPUTED_SIZEr  r|   r  rw   rJ   rK   r   I  s    z*Kernel.rename_indexing.<locals>.<dictcomp>)
r   listtupler0   rn   rE  rC  sortedZfree_symbolsr(   )rx   r   Zsorted_symbolsZreplacementsrJ   rw   rK   r  @  s   

zKernel.rename_indexingr  c                 O  s   t |i |S rX   )r  r  rJ   rJ   rK   r  W  s   zKernel.create_cse_varr7   c                 C  s   |du rdS | j | S )zC
        Returns arg name of a given input or output node.
        N)r  rc  r   )rx   r  rJ   rJ   rK   rc  Z  s   zKernel.arg_name)NT)r  r  r  rW   rC   rD   )r  r<   rC   r#  r  )r  r#   r  r  r  r  rC   r#  r   rB   r   rb   rC   r  r   rB   r   rb   rN  r  rC   rD   rX   
r   rB   r   rb   rN  r  r  r/   rC   rD   
rj   ri   r  ri   r  r.   rN  r  rC   r  r  r  r  r  r  r  rC   r  
r  r  r  r  r  rW   r  rW   rC   r  )rC   r  r  r  r  r  r  r  r  ri   r  rW   r  r  r  r  rC   r  r   )
r  r  r  r   r  r   r  r  rC   rB   r  )r   rb   rC   rB   r  r  r|  r   rB   rC   rD   )r   r  rC   rb   )r  r   r  r   rC   r  )r  r7   rC   r   )#r[   r\   r]   r  r   r  r  r  r,  r-  r  r  r
  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  rc  rL  rJ   rJ   rJ  rK   r  D  sL   
 "	





	


	


	

'


r  c                   @  s2   e Zd ZU dZded< dZded< dZded	< dS )
r"  r8  zClassVar[str]r#  Nr  rj   r   rB   ops_name)r[   r\   r]   r#  r   rj   r  rJ   rJ   rJ   rK   r"  c  s   
 r"  c                  C  s.   zdd l } | j| jdW S  ty   Y d S w )Nr   )	undefined)jinja2EnvironmentZStrictUndefinedImportError)r  rJ   rJ   rK   
jinja2_envk  s   r  c                   @  s\   e Zd ZdZe	d!d"d	d
Zed#ddZed$ddZd%ddZd&ddZ	d'ddZ
d S )(KernelTemplatezg
    Base class for defining kernel templates.

    Children classes: TritonTemplate, CUDATemplate
       sourcerB   num_indentsr   indents_spacingrC   c                   sD   |  d}t|dkr fdd|dd  D |dd < d|S )NTr=   c                   s   g | ]
}d    | qS )rk  rJ   )r   r  r	  r  rJ   rK   r!    s    z6KernelTemplate.indent_except_first.<locals>.<listcomp>r   )
splitlinesr'  rr   )r  r  r	  linesrJ   r
  rK   indent_except_first~  s   


z"KernelTemplate.indent_except_firstr   c              
   C  sj   t  }|d u r	d S tj|jd< ddlm} z|| W S  |y4 } zG dd d|}|||d }~ww )Nr  r   )TemplateSyntaxErrorc                      s(   e Zd Zd
 fddZddd	Z  ZS )zIKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxErrororiginal_errorr  rC   rD   c                   s$   t  |j|j|j|j || _d S rX   )rH  r  messagelinenor   filenamer  )rx   r  rJ  rJ   rK   r    s   
zRKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__init__rB   c                 S  s   d| j  d}|d| j d7 }t| jdrs| jjd}|d7 }td| j d }tt|| j d }t	||D ]:}|| j d krd||d  d	||  d7 }t| jd
rc|dd| jj
d   d 7 }q8||d  d||  d7 }q8|S )NzError in template at line 
zError message: r  z	Context:
r   r   r=   z: --> columnz     rk  z^
z:     )r  r  rG  r  r  splitmaxminr'  r&  r  )rx   
error_infor  startendr  rJ   rJ   rK   r6    s*   zQKernelTemplate._template_from_string.<locals>.DetailedTemplateSyntaxError.__str__)r  r  rC   rD   r   )r[   r\   r]   r  r6  rL  rJ   rJ   rJ  rK   DetailedTemplateSyntaxError  s    	r  )r  r  r  filtersr  r  Zfrom_string)r  envr  er  rJ   rJ   rK   _template_from_string  s   
z$KernelTemplate._template_from_string	fake_outsUnion[list[Buffer], Buffer]Callable[[str], torch.dtype]c                   sJ   t jj t| ttfrdd | D n|  |  id	 fdd}|S )
Nc                 S  s   i | ]	}|  | qS rJ   )r   r{   r  rJ   rJ   rK   r     r&  z2KernelTemplate._fake_get_dtype.<locals>.<dictcomp>r   rB   rC   ri   c                   s    | }|d ur|S  | S rX   )r:  )r   r;  Z_get_dtype_reallookuprJ   rK   r{     s   
z1KernelTemplate._fake_get_dtype.<locals>.get_dtype)r   rB   rC   ri   )r0   rn   r{   r   r  r  r   )r   r{   rJ   r#  rK   _fake_get_dtype  s   zKernelTemplate._fake_get_dtyper   rD   c                 C  s
   || _ d S rX   r  r   rJ   rJ   rK   r    r   zKernelTemplate.__init__choices	list[Any]r  Optional[NotImplementedError]c              
   K  sf   z| | jdi | W dS  ty2 } ztjd|t| t tjk d |W  Y d}~S d}~ww )a%  
        Maybe generates a new ChoiceCaller and appends it into existing choices.
        Returns None if success, otherwise returns the error.

        choices: A list of ChoiceCallers.
        kwargs: Additional kwargs to be passed to self.generate() to generate a new ChoiceCaller.
        Nz3Cannot Append Choice: %s. KernelTemplate type is %s)
stack_inforJ   )	rN  r  rS   rf  infor   getEffectiveLevelrG   INFO)rx   r&  r  r  rJ   rJ   rK   maybe_append_choice  s   z"KernelTemplate.maybe_append_choicer5   c                 K  r   )zM
        Generates a ChoiceCaller instance from the given arguments.
        r   )rx   r  rJ   rJ   rK   r    s   zKernelTemplate.generateN)r  )r  rB   r  r   r	  r   rC   rB   )r  rB   rC   r   )r   r!  rC   r"  r  )r&  r'  r  r   rC   r(  )r  r   rC   r5   )r[   r\   r]   r   r_   r  r  r%  r  r-  r  rJ   rJ   rJ   rK   r  w  s    
,

r  c                      s   e Zd Zd ZdR fddZdSddZdTddZ		dUdVddZdWd$d%ZdXd'd(Z	dYd*d+Z
	,dZd[d/d0Zd\d1d2Zd]d9d:Zd^dAdBZd_dEdFZ	,	,d`dadPdQZ  ZS )br  r  Kernel[Any]parent_handlerOpsHandler[Any]c                   s.   t    ddlm} | | _|| _|| _d S )Nr   ValueRangeAnalysis)rH  r  r}  r2  vr_analysisr  r/  )rx   r  r/  r2  rJ  rJ   rK   r    s
   

zCSEProxy.__init__r   rB   r  tuple[Any, ...]r  dict[str, Any]rC   r   c                   s^   | j g R i t| j i t dd fdd}t|S )	Nr   r  rB   rC   r  c                   s   t jjd ur!t j j}|dkrtjdkn
|dkrtjdknd}nd}|r8dkr-j}nt	 i }nd }t j
jjt j
j| |d}tjjrn|rnddlm} t|ttfr^| }t j
jd	| d
|| d d7 |  |S )Nr   r   r   FZmasked)r}  rj   r   )r)   r  r  rQ   r=   )r0   rn   rY  rS  r   r   r   r   rj   r  r  r  r  r  r  r  Ztorch._inductor.codegen.tritonr)   r   r  r  r'  r  )r  Z
device_strZtriton_backendr5  Zcsevarr)   r  r}  Zdtype_handlerr  r   Z
output_idxrN  rJ   rK   do_cse  sJ   

z!CSEProxy._default.<locals>.do_cse)r  rB   rC   r  )_bound_variabler  r/  r   pytreeZtree_map)rx   r   r  r  r7  rJ   r6  rK   _default  s   3zCSEProxy._defaultr~  c                   s   ddl m} ddlm} ttj|rt S tj	j
  j|kr8| jjdur8t| jjts.J | jj t S tjrgt||rgt fdddD rOt S |rSJ ddd}tt||}t| j|| S t S )z
        If the variable comes from an FX node, we forward the bound we have already computed
        Else, if the variable when codegen'ing another op, we try to compute its bounds
        r   r1  )TritonTemplateKernelNc                 3  s    | ]}| j v V  qd S rX   )r-  )r   r  Zfx_noderJ   rK   r%  D	  s    z+CSEProxy._bound_variable.<locals>.<genexpr>)Zset_indirectr  r  rR  r   rC   c                 S  s(   t | tr| jS t | tjrt| S | S rX   )r   r  r}  rs   rF  r   rV  rJ   rJ   rK   arg_to_boundM	  s
   
z.CSEProxy._bound_variable.<locals>.arg_to_bound)rR  r   rC   r   )r}  r2  Zselect_algorithmr;  r   r0   r  r   r  interpreterr  r-  r  dictr:  r   Zcompute_all_boundsrG  r  r  rB  r  r3  )rx   r   r  r  r2  r;  r=  Z
arg_boundsrJ   r<  rK   r8  2	  s    
zCSEProxy._bound_variableTr  r  r|   r  r  rW   r  r  c                 C  sX  t |tr
t|}t |tjsJ ||jjdk r|r;t|t	|t
j}|jjdkr:t|d}t|||}n|}t }|jt krtt |tjrt|jtt d@ }t|j| |j| }|jjdkrt|jtdt@ }	||	B }| jjj| jj||d}| j|||}
t|r|jjdk }t |tj p|jj|k  }| j|
||| |
S )Nr   r  )r}  )r   r   rs   r  rF  r}  r  r+   rl  r  r   longr  ltr  r   r  Numberr   r  r  r  r  r/  r  r"   r  )rx   r  r|   r  r  ZstmrA  Z
new_boundsZ
neg_boundsposZ	sympy_varZassert_lowerZassert_upperrJ   rJ   rK   r  Y	  s:   


zCSEProxy.indirect_indexingr   rb   r  r  rD   c                 C     | j ||||S rX   )r  r  r  rJ   rJ   rK   r  	  s   zCSEProxy.check_boundsr   c                 C  s|   || j jjv rtj j| t|tjr| j 	||S | j jj
}||v r(|| S | j ||}|jdkr<| j  jd7  _|S rS  )r  r  r  r0   r  rl  r   r   TMPr  r  r
  r  r  )rx   r   r   r  outrJ   rJ   rK   r
  	  s   

zCSEProxy.loadrN  c                 C  sX   || j jj|< | j jr&|tjjv r(| j j|}| D ]}|| j jj|< qd S d S d S rX   )	r  r  r  r  r0   rn   Zname_to_bufferZ
get_outputZget_mutations)rx   r   rN  rO  Z
other_namerJ   rJ   rK   _update_store_cache	  s   zCSEProxy._update_store_cacheNr  r/   c                 C  sF   | j j| |d u r| || |tjjvr!| j j||||dS d S )N)r  )r  r  rl  rG  r0   rn   r  r  r  rJ   rJ   rK   r  	  s   zCSEProxy.storec                 C  s:   | j j| | || |tjjvr| j |||S d S rX   )r  r  rl  rG  r0   rn   r  r  r  rJ   rJ   rK   r  	  s
   zCSEProxy.store_reductionrj   ri   r  r  r.   r  c                 C  s"   | j  jd7  _| j ||||S rS  )r  r  r  r  rJ   rJ   rK   r  	  s   zCSEProxy.reductionr  r  r  r  r  r  c                 C  s   | j |||S rX   )r  r  r  rJ   rJ   rK   r  	  s   	zCSEProxy.scanr  r  c                 C  rD  rX   )r  r  r  rJ   rJ   rK   r  	  s   zCSEProxy.sortr  r  r  r  r  r  r  r  r  c              	   C  s   | j |||||||S )a  
        [Note: Inductor bucketize op]

        Inputs:
        -------
        values: the values to be bucketized.
        boundaries: a tuple containing
          (a) the name of the boundaries tensor (which must be sorted, unless
          the sorting tensor is present),
          (b) the length of the tensor in the last dimension (i.e. the length of
          one set of boundaries),
          (c) the number of elements in the underlying storage (i.e. the length
          of the flattened tensor, ignoring striding), and
          (d) the stride of the tensor in the last dimension.
        boundary_indices: indices into a flattened version of the boundaries
        tensor, of the same size and shape as "values".  Each index points to
        the first element in the set of boundaries to be used for the
        corresponding value.
        indexing_dtype: the dtype to use when indexing into the boundaries
        tensor.  This must be int64 or int32.  This additionally specifies the
        dtype of the return value.
        right: see "Details" below.
        sorter: an optional tuple containing
          (a) the name of an optional sorting tensor, used to access unsorted
          boundaries without reordering the boundaries tensor, and
          (b) the stride of the tensor in the last dimension.
        The values in the sorting tensor are used as indices into the *last*
        dimension of the boundaries tensor, with all other indices matching.
        The size of the sorting and boundaries tensors must be equivalent.
        sorter_indices: must be present if the sorting array is present; see
        "boundary_indices" for the equivalent definition for the boundaries
        tensor.

        Output:
        -------
        The buckets each value belongs in, within a given set of boundaries.  0
        indicates a position before the first boundary, and len(boundaries_set)
        represents a position after the last boundary.

        Details:
        --------
        Given a value and a set of boundaries, calculate the bucket that each
        value belongs to.  This works differently in 1-D and N-D cases.

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [0, 4, 4, 8], right=True
        return =   [[ 0, 1, 1, 1], [1, 3, 3, 4]].

        for values [[-1, 0, 1, 2], [3, 4, 5, 9]], boundaries [[0, 4], [4, 8]], right=True
        return =   [[ 0, 1, 1, 1], [0, 1, 1, 2]]

        Note that in the N-D boundaries case, the shape of "values" and
        "boundaries" must match in every dimension _except_ the last.

        When right == False, bucket i refers to range (boundaries[i], boundaries[i+1]].
        When right == True,  bucket i refers to range [boundaries[i], boundaries[i+1]).

        Boundaries must be non-decreasing, or a sorter must be provided which
        would re-index offsets in a non-decreasing order (e.g. the second output
        of torch.sort(offsets)).  Otherwise, the result is undefined.
        )r  r  r  rJ   rJ   rK   r  	  s   FzCSEProxy.bucketize)r  r.  r/  r0  )r   rB   r  r4  r  r5  rC   r   )r   rB   r  r   r  r   rC   r~  r  )
r  r  r|   r  r  rW   r  rW   rC   r  r  r  )r   rB   rN  r  rC   rD   rX   r  r  r  r  r  r  r  )r[   r\   r]   r   r  r:  r8  r  r  r
  rG  r  r  r  r  r  r  rL  rJ   rJ   rJ  rK   r    s(    

=+
1







r  )rA   rB   rC   rD   rX   )
rf   rB   r   r   r   r   r   r   rC   rD   )rf   r   rC   r   )rf   r   r   r   rC   rW   )rf   rB   rC   r   )F)rf   rB   r   rW   rC   r   r|  )r   r   r   r   r   r   rC   r   )rf   rB   r   r   rC   rD   )rf   rB   rC   r   )r   rB   r  r   r  r   rC   r  )r  rB   rC   rW   rJ   r  )rC   r   )
__future__r   r,  dataclassesenumr)  r  rG   rg  r1  retypingr   r   r   r   r   r   r	   r
   r   r   r   r   Ztyping_extensionsr   rs   r   Ztorch.fxZtorch._prims_commonr   Ztorch.utilsr   r9  Ztorch.utils._ordered_setr   Ztorch.utils._sympy.numbersr   Ztorch.utils._sympy.printersr   Z_PythonPrinterZtorch.utils._sympy.symbolr   r   r   Ztorch.utils._sympy.value_rangesr   r   r   r   r   Zdtype_propagationr   Zops_handlerr   r   utilsr    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   Zvirtualizedr+   r,   r-   r.   r/   r0   collections.abcr1   r2   r3   r}   r4   r5   r6   r7   r>  r9   r?  r:   r;   r<   r   r?   r@   r   r   r   rB   r^  re  rO  Z_loggingZgetArtifactLoggerr[   rE   	getLoggerrf  rL   rM   ra   	dataclassr   r   r   r   r   ZKernelArgTyper   r   r   r   r   r   r   r   r   r   	lru_cacher   r   r   r   Zbfloat16r  Zfloat16rW   r\  Zfloat64Zint8Zint16rU  r  r   Zuint16rX  Zuint64r   r  r  rM  compile
IGNORECASEr  r  r  r  r?  ZINT_TO_FLOATr  r  r  r!  r/  r1  r4  r5  r7  r  r  r  r  rj   ZReductionCacheKeyr  r  r  r"  r  r  r  rJ   rJ   rJ   rK   <module>   sV   ,4 
^	6M*b
V e
&,49>CHOU[`flrw}             $  )  .  3  8  =  B  G  L  Q  V  [  c	  F$ %  !s