a
    hH                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZmZmZ d dlmZmZ d dlmZ d dlmZ d dl
mZmZmZmZmZmZmZmZmZ d dlmZmZm Z  d d	l!m"Z" d dl#Z#d d
l#m$Z$m%Z%m&Z& d dl'm(  m)  m*Z+ d dl,m-  m.Z/ d dl0Z1d dl2Z1d dl3m.  m4Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z?m@Z@mAZAmBZBmCZC d dlDmEZE d dlFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZP d dlQmRZR d dlSmTZTmUZUmVZV d dlWmXZX ddlYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z` ddl[maZambZbmcZcmdZdmeZe ddlfmgZg ddlhmiZimjZjmkZkmlZl ddlmmnZn ddlompZpmqZq ddl.mrZrmsZsmtZtmuZumvZvmwZwmxZxmyZymzZzm{Z{m|Z|m}Z}m~Z~mZmZmZmZmZmZmZmZ ddlmZmZmZ er2d dlmZ d d lmZ dd!lmZ dd"lmZ dd#l.mZ neZd$ed%< zd dlZejZd&ZW n eyp   dZd'ZY n0 ed(Zed)Zed*Zeee$f Zd$ed+< eeee$f Zd$ed,< eeZejejd-d.Ze1jjZeed/eed/f d0d1eeeeeed/f d/d0d1f   f Zd$ed2< d3d4d5d6d7Zejd&d8G d9d: d:Zd;d<d=d>d?Zd@dAdBdCdDZdEdFdGdHdIZdEdFdGdJdKZdLdMdNdOdPdQZdRd4dSdTdUdVZg dWZg dXZdxdYdZdEd[d\d]Zd^dEdGd_d`ZdydYdZdEd[dadbZedzdcd4d<dddedfZed{d1d4dgdddhdfZd|did4djdddkdfZdldmdndodpZdqdrd5dsdtZdud4d5dvdwZdud4d5dxdyZdzd{d4d|d}d~Zdddd4dddZddddddZdd<dddZdddddZG dd1 d1Ze~d'd8G dd dZe~G dd deZddddddZe~G dd deăZe~G dd deƃZededededededdZded< d}d@dd4ddddZe~G dd deăZeee$ ee$ gef ZG dd deʃZG dd dẽZG dd dẽZe~G dd deăZe~G dd deσZe~G dd deăZd1d4d5ddZd1d4d5ddZӐd~d1d4d4dd4dddddÄZd1d^d4dĜddƄZd1d4dǜddɄZe~G dd˄ deZe~G dd̈́ de׃Ze~G ddτ de׃Ze~G ddф de׃Ze~G ddӄ de׃Ze~G ddՄ deۃZe~G ddׄ de׃Ze~G ddل de׃ZG ddۄ de܃Ze~G dd݄ deZe~G dd߄ deZe~G dd deZddd4dddZdd{dddZG dd dZe~G dd deZG dd deZG dd deZG dd deZG dd deZG dd deZe~G dd deZG dd deZe~d'd8G dd dee^Ze~d'd8G dd deeÃZG dd deZG d d deZG dd deZe~G dd deZe~G dd deZe~d'd8G dd	 d	eZG d
d deZG dd deZeeeeeeeeeeef  f ZG dd dZG dd deZG dd deZG dd deZG dd deZe~d'd8G dd deZ G dd de ZG dd deZe~d'd8G dd de Ze~d'd8G d d! d!eZG d"d# d#eZG d$d% d%eZG d&d' d'eZG d(d) d)eZG d*d+ d+eZ	G d,d- d-eZ
G d.d/ d/eZG d0d1 d1eZG d2d3 d3eZG d4d5 d5eZG d6d7 d7eZG d8d9 d9eZG d:d; d;eZG d<d= d=eZG d>d? d?eZG d@dA dAeZG dBdC dCeZG dDdE dEeZe~d'd8G dFdG dGZG dHdI dIeZe~d'd8G dJdK dKeZe~G dLdM dMeZG dNdO dOeZejG dPdQ dQeZG dRd/ d/eZG dSdT dTeZe~d'd8G dUdV dVeZdWd4dXdYdZZ e~d'd8G d[d\ d\eZ!e~d'd8G d]d^ d^eZ"d_d`dadbdcZ#e~d'd8G ddde deeZ$G dfdg dgeZ%G dhdi dieZ&e~G djdk dke&Z'e~G dldm dme&Z(G dndo doeZ)G dpdq dqe)Z*d3drdsdtduZ+d3drdsdvdwZ,dS (      )annotationsN)	GeneratorIterableSequence)AbstractContextManagernullcontext)Enum)partial)	AnyCallableClassVarLiteralOptionaloverloadTYPE_CHECKINGTypeVarUnion)assert_neverNever	TypeAlias)patch)ExprIntegerSymbol)identity)GraphModuleSerializer)can_auto_functionalize)metrics)compute_required_storage_lengthis_boolean_dtypeis_float_dtypemake_channels_last_strides_for
StrideType)get_schema_info)
&_remove_effect_token_unbacked_bindingscompute_unbacked_bindingsfree_symbolsfree_unbacked_symbolsIterateExprsrebind_unbackedresolve_unbacked_bindingsShapeEnvstatically_known_trueSymTypes
OrderedSet)CleanDivFloorDivModularIndexing)SymT   )configdependencies)BackendFeatureCodegenSymbolget_scheduling_for_deviceindex_prevent_reordering)Depextract_free_symbols#extract_input_node_reduction_rangesextract_read_writesvar_builder)LoopBody)OpCounterCSEOpCountResultReductionType	StoreMode)benchmarker)DevicePropertiesReductionHint)argsortargsort_symcache_on_selfceildivconvert_shape_to_inductorconvert_shape_to_symintdeveloper_warningdo_bench_using_profilingdtype_from_sizeget_dtype_sizeget_kernel_metadataGPU_ALIGN_BYTESir_dataclass
is_dynamicis_gpu	sympy_dotsympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_substensor_is_aligned)opsOpsValueV)FakeScriptObject)Node)CUDATemplate)GraphLowering)IndentedBufferr   rb   TF_T_U_V_IntLike_NumLikez  prefix	TensorBoxr   IRNode_NodeOrNodesobjectboolxreturnc                 C  s   t | ttfS N)
isinstanceintr   rr    rx   @/var/www/auris/lib/python3.9/site-packages/torch/_inductor/ir.py
_is_static   s    rz   )frozenc                   @  s>   e Zd ZU ded< ded< ded< ded< d	ed
< ded< dS )GraphPartitionSignatureOrderedSet[sympy.Symbol]Zsymbol_inputsz5dict[str, Union[IRNode, sympy.Expr, TorchBindObject]]input_nodeslist[IRNode]Zoutput_nodeszdict[str, bool]Zinput_deallocationrp   Zskip_cudagraphz	list[str]Zconstant_namesN__name__
__module____qualname____annotations__rx   rx   rx   ry   r|      s   
r|   Optional[_NodeOrNodes]None)node_or_nodesrs   c                   s    ddd fdd  |  d S )Nr   r   )nodesrs   c                   s   | d u r
nzt | ttfr,| D ]} | qnXt | trN|  D ]} | q>n6t | ttttt	j
jjttttf	sJ dt|  dd S )NzFound zE, which is not a supported top level IR node. See [Note: Inductor IR])ru   listtupledictvalues
ExpandViewDynamicScalarAssertScalarrl   sympylogicboolalgBooleanr   rv   EffectfulKernelShapeAsConstantBuffertype)r   node_check_tensorboxrx   ry   r      s.    
z%validate_ir.<locals>._check_tensorboxrx   )r   rx   r   ry   validate_ir   s    r   strCallable[..., OpsValue]namers   c                   s(   t  tsJ dddd fdd}|S )Nro   r^   argskwargsrs   c                    s   t t | i |S rt   )getattrr]   r   r   r   rx   ry   fn  s    zops_wrapper.<locals>.fn)ru   r   )r   r   rx   r   ry   ops_wrapper   s    r   zSequence[int]z&Callable[[Sequence[_T]], Sequence[_T]]orderrs   c                   s.   t t| tt|  ddd fdd}|S )NSequence[_T]indexrs   c                   s0   t  t ksJ  fddtt  D S )Nc                   s   g | ]} |  qS rx   rx   .0i)r   	inv_orderrx   ry   
<listcomp>      z4inverse_reorder.<locals>.reindex.<locals>.<listcomp>lenranger   r   r   ry   reindex
  s    z inverse_reorder.<locals>.reindex)r   zipr   r   r   r   rx   r   ry   inverse_reorder  s    r   c                   s   ddd fdd}|S )Nr   r   c                   s0   t  t ksJ  fddtt  D S )Nc                   s   g | ]} |  qS rx   rx   r   )r   r   rx   ry   r     r   z1same_reorder.<locals>.reindex.<locals>.<listcomp>r   r   r   r   ry   r     s    zsame_reorder.<locals>.reindexrx   r   rx   r   ry   same_reorder  s    r   z&Callable[[Sequence[_U]], Sequence[_V]]z&Callable[[Sequence[_T]], Sequence[_U]]&Callable[[Sequence[_T]], Sequence[_V]])reindex1reindex2rs   c                   s   ddd fdd}|S )Nr   zSequence[_V]r   c                   s    | S rt   rx   r   r   r   rx   ry   r     s    z fuse_reindexing.<locals>.reindexrx   )r   r   r   rx   r   ry   fuse_reindexing  s    r   r(   r}   )rr   unbacked_onlyrs   c                 C  s   |rt | S t| S d S rt   )r'   r&   )rr   r   rx   rx   ry   get_free_symbols#  s    r   )   r      r4   )   r   r   r   r4   z(Sequence[Union[int, torch.SymInt, Expr]]zOptional[ShapeEnv])seq	shape_envrs   c                 C  s    |du rt | }n
t|| }|S )z1
    Convert strides to fill order (argsort)
    N)rH   rI   )r   r   
sorted_idxrx   rx   ry   get_fill_order.  s    

r   zSequence[Union[int, Integer]]c                   s0   dd t | D   fddtt| D }|S )z
    Convert stride order to fill order
    For channel last format,

    stride order = [3, 0, 2, 1] and fill order = [1, 3, 2, 0]
    c                 S  s   i | ]\}}||qS rx   rx   r   idxposrx   rx   ry   
<dictcomp>C  r   z+stride_order2fill_order.<locals>.<dictcomp>c                   s   g | ]} | qS rx   rx   r   lookuprx   ry   r   D  r   z+stride_order2fill_order.<locals>.<listcomp>)	enumerater   r   )r   
fill_orderrx   r   ry   stride_order2fill_order<  s    r   c                 C  s>   t | |}dd tt| D }t|D ]\}}|||< q(|S )z)
    Convert strides to stride order
    c                 S  s   g | ]}d qS r   rx   r   _rx   rx   ry   r   O  r   z$get_stride_order.<locals>.<listcomp>)r   r   r   r   )r   r   r   outr   elemrx   rx   ry   get_stride_orderH  s
    

r   zLiteral[None])rr   guard_shapers   c                 C  s   d S rt   rx   rr   r   rx   rx   ry   ir_node_to_tensorU  s    r   torch.Tensorc                 C  s   d S rt   rx   r   rx   rx   ry   r   Y  s    Optional[IRNode]zOptional[torch.Tensor]c                   s   | d u rd S |st jjj nt  fdd|  D }t| rX fdd|  jD }n
t	
|}|  }|  }t|}t|}t jjj & tj||||d }W d    n1 s0    Y  |S )Nc                   s   g | ]} |qS rx   rx   r   sZshape_fnrx   ry   r   h  r   z%ir_node_to_tensor.<locals>.<listcomp>c                   s   g | ]} |qS rx   rx   r   r   rx   ry   r   k  r   )sizestridedtypedevice)r_   graphsizevars	size_hintr   get_sizeis_storage_and_layout
get_layoutr   FlexibleLayoutcontiguous_strides	get_dtype
get_devicerM   r   Zsuppress_guardstorchempty_stridedZzero_)rr   r   r   r   r   r   trx   r   ry   r   ]  s$    
(zOptional[Sequence[_T]]z Optional[Sequence[Optional[_T]]]valuers   c                 C  s   t | tr| sd gS | S rt   )ru   r   r   rx   rx   ry   may_convert_to_optionaly  s    r   z2Union[IRNode, OutputSpec, torch.device, None, str]Optional[str]c                 C  sb   t | ts| d u r| S t | tjr(| jS t | ttfrBt|  S t	d|  dt| j
 d d S )Nzget_device_type(: ))ru   r   r   r   r   rm   
OutputSpecget_device_typer   r   r   rw   rx   rx   ry   r     s    r   z&Union[IRNode, torch.device, None, str]c                 C  sl   t | }|dv r,tt| ddkr(dS dS |d u sDt| }d u rHdS ddlm} t|tsbJ t||S )N)cpucudaZ_backendtritonTFr4   )TritonScheduling)	r   r   r5   r9   Zcodegen.tritonr   ru   r   
issubclass)rr   r   Zdevice_schedulingr   rx   rx   ry   	is_triton  s    r  c                 C  s   t | dkS )Nr   )r   rw   rx   rx   ry   is_cpu  s    r  zUnion[Buffer, TensorBox]rv   )rr   	alignmentrs   c                   s~   t tr d u rdS t fddtt d D }tjj	
 d dkpttjj	
 d dk}|o||S )NFc                 3  s,   | ]$}t jj |   d kV  qdS )r   N)r_   r   r   size_hint_or_throw
get_strider   r  rr   rx   ry   	<genexpr>  s   z-is_aligned_realized_tensor.<locals>.<genexpr>r4   )ru   rm   maybe_get_strideallr   r   r  r_   r   r   r  r   )rr   r  Zaligned_stridesZaligned_last_dimrx   r  ry   is_aligned_realized_tensor  s    r  Sequence[_IntLike])strides1strides2shapers   c                 C  s   t |t | kr t | t |ks$J t|| |D ]N\}}}tjj|drLq0tjj||s0tjj|tjj|ks0 dS q0dS )zP
    Returns true if the strides are equal, ignoring dimensions of size 1 .
    r4   FT)r   r   r_   r   r   statically_known_leqstatically_known_equalssymbolic_hint)r  r  r  dims1s2rx   rx   ry   significant_strides_equal  s    $r  zUnion[TensorBox, BaseView]z"Sequence[Union[int, torch.SymInt]])tensorstridesrs   c                 C  s   t | s| S tdd t||  D r,| S t||  |  sD| S t| \}}g |j}t|  D ]$\}}t	j
j|drf|| ||< qft|j|j|j||j}tt||dS )a  
    Tries to match the strides of the tensor to those in the meta_strides. Strides of insignificant
    dimensions - size 0 or 1 - will be updated.

    If there are real stride differences (NHWC vs NCHW), or the tensor is not realized, then the input will be returned
    c                 s  s"   | ]\}}t jj||V  qd S rt   r_   r   r   r  r   r  r  rx   rx   ry   r    s   z2try_match_insignificant_strides.<locals>.<genexpr>r4   datalayout)r   r
  r   r  r  r   as_storage_and_layoutr   r   r_   r   r   r  FixedLayoutr   r   r   offsetrl   ReinterpretView)r  r  storage
old_layout
new_strider   r   
new_layoutrx   rx   ry   try_match_insignificant_strides  s*    

r&  torch.fx.GraphModule)gmrs   c                 C  sD   | j jddd }dd t|jD |jd< ddlm} ||  d S )Noutput)opr   c                 S  s   g | ]\}}|qS rx   rx   )r   r   r   rx   rx   ry   r     s   z.gm_original_output_strides.<locals>.<listcomp>Zuser_visible_output_idxs)record_original_output_strides)r   Z
find_nodesr   r   metaZtorch._inductor.compile_fxr+  )r(  output_noder+  rx   rx   ry   gm_original_output_strides  s    r.  list[Buffer]
list[Expr]inputsrs   c                 C  s@   t  }| D ],}|t| ddO }|t| ddO }q
t|S )NFr   )r/   r   r   r  r   )r2  Zsym_varsinprx   rx   ry   get_symbolic_inputs  s
    r5  c                   @  s  e Zd ZU e Zded< ejddZded< ejddZ	ded< ejddZ
d	ed
< eejdddddZddddddZddddZddddZddddZd	dddZd dd!d"Zdd$d%d&d'd(Zdd)d$d$dd*d+d,Zd-dd.d/Zd0dd1d2Zd3dd4d5Zd6dd7d8Zd9dd:d;Zd<dd=d>Zd$dd?d@ZdAddBdCZdDddEdFZe dGddHdIZ!dJddKdLZ"d$ddMdNZ#dOddPdQZ$ddSddTdUdVZ%dWddXdYZ&dZdd[d\Z'd$dd]d^Z(d_dd`daZ)dbddcddZ*deddfdgZ+dDddhdiZ,dddjdkZ-dOddldmZ.d$ddndoZ/ddpd$dqdrdsZ0dtddudvdwZ1dddxdyZ2d ddzd{Z3ddd|d}Z4dd~d$ddddZ5d~ddddZ6dddddZ7ddd$ddddZ8ddddZ9ddddZ:dtdddZ;ddddZ<dd$ddddZ=dOdddZ>ddddZ?d$dddZ@d$dddZAdZd dddZBd%dddZCddddZDd%dddZEeFre d-dddZGdRS )rm   zClassVar[OrderedSet[Any]]_current_originsF)initOrderedSet[Any]originsOptional[list[str]]	tracebackOptional[torch.fx.Node]origin_nodezOrderedSet[Node]zGenerator[None, None, None])r9  rs   c                 c  s.   t j}|| B t _zd V  W |t _n|t _0 d S rt   )rm   r6  )r9  oldrx   rx   ry   current_origins  s
    
zIRNode.current_originsr   r
   r   )attrr   rs   c                 C  s   t | || d S rt   )ro   __setattr__)selfr@  r   rx   rx   ry   _post_init_setattr  s    zIRNode._post_init_setattrrs   c                 C  s<   |  dt| j |  dtjr&t nd  |  dd  d S )Nr9  r;  r=  )rC  r/   r6  r5   Zdebug_ir_tracebackr;  format_stackrB  rx   rx   ry   __post_init__  s
    zIRNode.__post_init__OrderedSet[str]c                 C  s   t dd |  D S )Nc                 s  s   | ]}|j V  qd S rt   r   r   deprx   rx   ry   r  '  r   z(IRNode.get_read_names.<locals>.<genexpr>r/   	get_readsrF  rx   rx   ry   get_read_names&  s    zIRNode.get_read_namesc                 C  s   | j S rt   )r;  rF  rx   rx   ry   get_traceback)  s    zIRNode.get_tracebackc                 C  s   | j S rt   r=  rF  rx   rx   ry   get_origin_node,  s    zIRNode.get_origin_nodeOptional[Operation]c                 C  s   d S rt   rx   rF  rx   rx   ry   get_defining_op/  s    zIRNode.get_defining_opTrp   Sequence[str])shortenrs   c                 C  s:   dt | dd }|r4t|dkr4|d d  d}|gS )Nzorigins=r9   @   =   z...)r   r   )rB  rT  r9  rx   rx   ry   common_repr2  s    zIRNode.common_reprzSequence[object])linesrT  	multiliners   c                 C  sf   t |t | | }t tt|}|rLtd|}t| j d| dS t| j d| dS d S )Nz,
z(
z
)(r   )r   rX  mapr   indentjoinr   r   )rB  rY  rT  rZ  	new_linesrx   rx   ry   
str_helper9  s    zIRNode.str_helpertorch.dtypec                 C  s   | j S rt   r   rF  rx   rx   ry   r   D  s    zIRNode.get_dtypezOptional[torch.dtype]c                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )r   NotImplementedErrorrF  rx   rx   ry   maybe_get_dtypeG  s    
zIRNode.maybe_get_dtypeLayoutc                 C  s   t dt|  dd S )Nz#get_layout() is not implemented by !rc  r   rF  rx   rx   ry   r   M  s    zIRNode.get_layoutzOptional[Layout]c                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )r   rc  rF  rx   rx   ry   maybe_get_layoutP  s    
zIRNode.maybe_get_layoutr   c                 C  s   |   S rt   )r   rF  rx   rx   ry   get_output_specV  s    zIRNode.get_output_speczOptional[OutputSpec]c                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )ri  rc  rF  rx   rx   ry   maybe_get_output_specY  s    
zIRNode.maybe_get_output_specc                 C  s   t |  tS )z4True for single tensor output (excludes MultiOutput))ru   rj  re  rF  rx   rx   ry   has_tensor_output_  s    zIRNode.has_tensor_outputSequence[Expr]c                 C  s   t dt|  dd S )Nz!get_size() is not implemented by rf  rg  rF  rx   rx   ry   r   c  s    zIRNode.get_sizeOptional[Sequence[_IntLike]]c                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )r   rc  rF  rx   rx   ry   maybe_get_sizef  s    
zIRNode.maybe_get_sizez.Union[_IntLike, sympy.Rel, Sequence[_IntLike]]c                 C  s   |   S rt   r   rF  rx   rx   ry   r  l  s    zIRNode.shaper   c                 C  s   t |  S rt   )rZ   r   rF  rx   rx   ry   	get_numelp  s    zIRNode.get_numelc                 C  s   t jjt|  dS Nr   r_   r   r   r,   r   Eqrp  rF  rx   rx   ry   is_zero_elementss  s    zIRNode.is_zero_elementsr   c                 C  s   t dt|  dS )a)  
        If the IRNode refers to data which has not been materialized (e.g.,
        it is a Pointwise/Reduction that could potentially have more
        compute fused into it), realize the IRNode into physical memory,
        ending the possibility of fusing into it, but allowing, e.g., multiple
        users to access the data without having to recompute.

        Check StorageBox.realize for a particularly notable implementation.

        TODO(ezyang): I think, in principle, every IRNode should have an
        implementation of this, and most of the time no-op is OK, but you
        really do have to audit each IRNode for this, so for now, raise
        an error if it's not implemented.  Note that some code in graph.py
        will catch this thrown error and suppress it with a warning.
        zrealize NYI on Nrg  rF  rx   rx   ry   realizev  s    zIRNode.realizeNOptional[IndentedBuffer]writerrs   c                 C  s   t dt|  d S )Nzcodegen_reference NYI on rg  rB  rx  rx   rx   ry   codegen_reference  s    zIRNode.codegen_referenceOptional[torch.device]c                 C  s   d S rt   rx   rF  rx   rx   ry   r     s    zIRNode.get_devicetorch.devicec                 C  s   |   }|d usJ |S rt   r   rB  r   rx   rx   ry   get_device_or_error  s    zIRNode.get_device_or_errorc                 C  s   dS NFrx   rF  rx   rx   ry   has_exceeded_max_reads  s    zIRNode.has_exceeded_max_reads$Callable[[Sequence[Expr]], OpsValue]c                 C  s   t t| jd S rt   rc  r   r   rF  rx   rx   ry   make_loader  s    zIRNode.make_loader Callable[[Sequence[Expr]], Expr]c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   make_indexer  s    zIRNode.make_indexerr  c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   r    s    zIRNode.get_stridec                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )r  rc  rF  rx   rx   ry   r	    s    
zIRNode.maybe_get_stridec                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_name  s    zIRNode.get_namec                 C  s$   z
|   W S  ty   Y d S 0 d S rt   )r  rc  rF  rx   rx   ry   maybe_get_name  s    
zIRNode.maybe_get_namec                 C  s,   z|   tjjv W S  ty&   Y dS 0 d S r  )r  r_   r   graph_inputsrc  rF  rx   rx   ry   is_input_buffer  s    zIRNode.is_input_bufferOptional[int]	thresholdrs   c                 C  s   dS r  rx   rB  r  rx   rx   ry   has_large_inner_fn  s    zIRNode.has_large_inner_fnrv   usersrs   c                 C  s   d S rt   rx   rB  r  rx   rx   ry   
mark_reuse  s    zIRNode.mark_reusec                 C  s   d S rt   rx   rF  rx   rx   ry   realize_hint  s    zIRNode.realize_hintc                 C  s   t t| jd S rt   r  rF  rx   rx   ry   unwrap_view  s    zIRNode.unwrap_viewc                 C  s   t t| jd S rt   r  rF  rx   rx   ry   freeze_layout  s    zIRNode.freeze_layout	list[int]r   allow_paddingrs   c                 C  s   t t| jd S rt   r  rB  r   r  rx   rx   ry   freeze_layout_with_stride_order  s    z&IRNode.freeze_layout_with_stride_orderr   c                 C  s   t t| jd S rt   r  rB  r   rx   rx   ry   freeze_layout_with_fill_order  s    z$IRNode.freeze_layout_with_fill_orderlist[_IntLike]r   rs   c                 C  s   t t| jd S rt   r  rB  r   rx   rx   ry   freeze_layout_with_same_order  s    z$IRNode.freeze_layout_with_same_orderexact_stridesr  rs   c                 C  s   t t| jd S rt   r  rB  r  r  rx   rx   ry    freeze_layout_with_exact_strides  s    z'IRNode.freeze_layout_with_exact_stridesdependencies.ReadWritesc                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_read_writes  s    zIRNode.get_read_writesOrderedSet[Dep]c                 C  s
   |   jS rt   r  readsrF  rx   rx   ry   rL    s    zIRNode.get_readsc                 C  s   t |  S rt   )r   rL  rF  rx   rx   ry   	num_reads  s    zIRNode.num_readsrh   c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_storage_numel  s    zIRNode.get_storage_numelr}   r   rs   c                 C  s   t t| jd S rt   r  rB  r   rx   rx   ry   get_free_symbol_uses  s    zIRNode.get_free_symbol_usesc                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_reduction_type  s    zIRNode.get_reduction_typeSequence[sympy.Expr]c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_reduction_size  s    zIRNode.get_reduction_sizec                 C  s   dS r  rx   rF  rx   rx   ry   	is_extern  s    zIRNode.is_externc                 C  s   dS r  rx   rF  rx   rx   ry   is_no_op  s    zIRNode.is_no_opr   rs   c                 C  s   t t| jd S rt   r  r~  rx   rx   ry   constant_to_device  s    zIRNode.constant_to_devicec                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_mutation_names  s    zIRNode.get_mutation_namesc                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_operation_name  s    zIRNode.get_operation_namec                 C  s   t t| jd S rt   r  rF  rx   rx   ry   get_inputs_that_alias_output  s    z#IRNode.get_inputs_that_alias_outputc                 C  s   d S rt   rx   rF  rx   rx   ry   r     s    zIRNode.dtype)T)TT)N)N)F)F)F)Hr   r   r   r/   r6  r   dataclassesfieldr9  r;  r=  staticmethod
contextlibcontextmanagerr?  rC  rG  rM  rN  rP  rR  rX  r`  r   rd  r   rh  ri  rj  rk  r   rn  propertyr  rp  rt  ru  rz  r   r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r  rL  r  r  r  r  r  r  r  r  r  r  r  r   r   rx   rx   rx   ry   rm     s   
    c                   @  s   e Zd ZddddZddddZddd	d
ZddddZddddZddddZddddZ	ddddZ
dddddZddddZddd d!Zd"dd#d$Zd%dd&d'Zd0dd%d)d*d+Zd,dd-d.Zd/S )1	Operationr   rD  c                 C  s
   d | _ d S rt   Zoperation_namerF  rx   rx   ry   rG    s    zOperation.__post_init__r{  c                 C  s   t d S rt   rc  rF  rx   rx   ry   r     s    zOperation.get_devicer<  c                 C  s   t | dsJ | jS Nr=  )hasattrr=  rF  rx   rx   ry   rP  
  s    zOperation.get_origin_noder8  c                 C  s   t | dsJ | jS )Nr9  )r  r9  rF  rx   rx   ry   get_origins  s    zOperation.get_originsr   c                 C  s   | j d usJ | j S rt   r  rF  rx   rx   ry   r    s    zOperation.get_operation_namerp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zOperation.is_externc                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zOperation.is_no_opr  c                 C  s   t d S rt   r  rF  rx   rx   ry   r    s    zOperation.get_read_writesr   c                 C  s   ||   v S rt   )rM  )rB  r   rx   rx   ry   
is_user_of  s    zOperation.is_user_ofrH  c                 C  s   t dd |  D S )Nc                 s  s   | ]}|j V  qd S rt   r   rI  rx   rx   ry   r  #  r   z+Operation.get_read_names.<locals>.<genexpr>rK  rF  rx   rx   ry   rM  "  s    zOperation.get_read_namesr  c                 C  s
   |   jS rt   r  rF  rx   rx   ry   rL  %  s    zOperation.get_readsr/  c                 C  s   t d S rt   r  rF  rx   rx   ry   get_outputs(  s    zOperation.get_outputsr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   get_unbacked_symbol_defs+  s    z"Operation.get_unbacked_symbol_defsFr  c                 C  s   t  S )a  
        When unbacked_only=True:
        Returns the unbacked symbols which are required to be in scope in
        order to successfully perform codegen for this buffer.  For example,
        a buffer that corresponds to an extern kernel call that takes i0 as
        an argument would return {i0} here.  This is used to generate necessary
        dependencies that ensure we actually bind i0 in codegen before you
        try to use it.

        Note that this is NOT transitive; in particular, if this buffer takes
        in as input another buffer with dynamic shape (e.g., (i0,)), we will
        not report it here, because you will already have a dependency
        on that buffer, which will eventually have a dependency on i0 if
        necessary.

        When unbacked_only=False:
        Similar to `unbacked_only=True` but including all free symbols
        instead of only free unbacked symbols.
        r.   r  rx   rx   ry   r  .  s    zOperation.get_free_symbol_usesrv   c                 C  s   dS )z
        Gets extra global memory size needed by this buffer.
        Some algorithms (e.g. group gemm) may require extra global memory in the generated code.
        r   rx   rF  rx   rx   ry   get_workspace_sizeF  s    zOperation.get_workspace_sizeN)F)r   r   r   rG  r   rP  r  r  r  r  r  r  rM  rL  r  r  r  r  rx   rx   rx   ry   r    s     r  c                      s  e Zd ZU ded< ded< ded< ded< dQd
ddddZdddddZdd fddZddddZeZddddZ	ddddZ
d dd!d"Zd dd#d$Zed%d%d&d'd(d)Zeejfdd*d d+d,d-Zed.dd/d0Zd1dd2d3Zeddd4d5ZdRd7d
d8d9d:ZdSd
d;dd<d=Zd>dd?d@ZdAddBdCZdDddEdFZdGddHdIZdJddKdLZddMdNdOdPZ  ZS )TLoopsr|  r   ra  r   Callable[..., Any]inner_fnr  rangesFrp   r}   r  c                   s,   t  jg  fdd| jD |  R  S )Nc                 3  s   | ]}t | V  qd S rt   r   r   er3  rx   ry   r  Y  r   z-Loops.get_free_symbol_uses.<locals>.<genexpr>)r/   unionr  inner_fn_free_symbolsr  rx   r3  ry   r  U  s
    zLoops.get_free_symbol_usesrS  r   )namesrs   c                   sF     d jj dt j  g fdd|D  d jg S )N'c                   s    g | ]}| d t  | qS =)r   )r   r   rF  rx   ry   r   d  r   z!Loops._to_str.<locals>.<listcomp>origin_node=)r`  r   r   r   r   inner_fn_strr=  )rB  r  rx   rF  ry   _to_str]  s    zLoops._to_strr   rD  c                   s   t    d S rt   )superrG  rF  	__class__rx   ry   rG  h  s    zLoops.__post_init__c                 C  s
   |  dS )Nr  r  rF  rx   rx   ry   __str__k  s    zLoops.__str__r{  c                 C  s   | j S rt   r   rF  rx   rx   ry   r   p  s    zLoops.get_devicer<  c                 C  s   | j S rt   rO  rF  rx   rx   ry   rP  s  s    zLoops.get_origin_noderl  c                 C  s   | j S rt   r  rF  rx   rx   ry   r   v  s    zLoops.get_sizec                 C  s   | j S rt   r  rF  rx   rx   ry   get_pointwise_sizey  s    zLoops.get_pointwise_sizer
   rl   r   c                 O  sN   | dd }| dd }| |i |}|d| |d|p@|j t|S )Nr=  r;  )poprC  r;  rl   create)clsr   r   r=  tbrrx   rx   ry   r  |  s    zLoops.creater3   )r  rk   rs   c                   s    fddt | D S )Nc                   s*   g | ]"\}}|d krt jjnt |qS r4   )r   SZerorY   )r   nr   rj   rx   ry   r     s   z Loops._index.<locals>.<listcomp>)r   )r  rk   rx   rj   ry   _index  s    
zLoops._indexrB   c              	   C  s   t t }t|b ttdd4 | j|    |	 W  d    W  d    S 1 s`0    Y  W d    n1 s~0    Y  d S Nallow_indexingT)
rA   r_   ZMockHandlerZset_ops_handlerr   ro   r   r  inner_fn_argsgetvalue)rB  Z	opcounterrx   rx   ry   inner_fn_opcount  s    zLoops.inner_fn_opcountSequence[Sequence[_IntLike]]c                 C  s   |  | jfS rt   )r  r  rF  rx   rx   ry   r    s    zLoops.inner_fn_argsc                 C  s   t jj| jg|  R  S rt   )r_   ZKernelFormatterHandlerZir_to_stringr  r  rF  rx   rx   ry   r    s
    zLoops.inner_fn_strNr  r  c                 C  s&   |d u rd}t |tj}|  j|kS rq  )maxr5   Zrealize_opcount_thresholdr  Znum_opsr  rx   rx   ry   r    s    zLoops.has_large_inner_fnOrderedSet[Symbol]c                 C  s   |  | j}t| j||dS Nr3  )r  r  r<   r  )rB  r   r   rx   rx   ry   r    s    zLoops.inner_fn_free_symbolsr  c                 C  s   t tddb |  r@t|  |  |  jW  d    S t|  |  jW  d    S W d    n1 sv0    Y  d S r  )	r   ro   r   r  r>   r  r   r  r  rF  rx   rx   ry   rL    s    zLoops.get_readsrH  c                 C  s   t |  jS rt   )r/   r  read_buffersrF  rx   rx   ry   rM    s    zLoops.get_read_namesrv   c                 C  s   t |  jS rt   )r   r  r  rF  rx   rx   ry   r    s    zLoops.num_readsr  c                 C  s   t dt|  dd S )Nz+get_reduction_size() is not implemented by rf  rg  rF  rx   rx   ry   r    s    zLoops.get_reduction_sizer   c                 C  s   t dt|  dd S )Nz+get_reduction_type() is not implemented by rf  rg  rF  rx   rx   ry   r    s    zLoops.get_reduction_typerm   r  c                 C  s   t dt|  dd S )Nz+constant_to_device() is not implemented by rf  rg  r~  rx   rx   ry   r    s    zLoops.constant_to_device)F)N)F) r   r   r   r   r  r  rG  r  __repr__r   rP  r   r  classmethodr  r  r3   INDEXr  rJ   r  r  r  r  r  rL  rM  r  r  r  r  __classcell__rx   rx   r  ry   r  N  s>   
 	r  zUnion[Expr, Sequence[Expr]]ra  r^   )r   r   rs   c                C  s&   |j rttd|S td|S d S )Nnanr   )is_floating_pointr]   constantfloat)r   r   rx   rx   ry   nop_loader_fn  s    r  c                   @  sZ   e Zd ZddddZddddZddd	d
ZdddddddZdddddZdS )	Pointwiser  rD  c                 C  s   |   rtt| jdS | jS Nrb  )rt  r	   r  r   r  rF  rx   rx   ry   r    s    zPointwise.make_loaderr  c                 C  s   g S rt   rx   rF  rx   rx   ry   r    s    zPointwise.get_reduction_sizer   c                 C  s   d S rt   rx   rF  rx   rx   ry   r    s    zPointwise.get_reduction_type!Callable[[Sequence[Expr]], Never]rl  r   output_nameindexervarsrs   c                 C  s"   |   }t|pd||||S Nunnamed)r  r]   storerB  r  r  r  loaderrx   rx   ry   store_output  s    zPointwise.store_outputr|  rm   r  c                 C  s.   |   }ttd||}t|| j|| jdS FMove this to a given device. Requires that all reads are to constants.override_devicer   r   r  r  )r  r   ro   ConstantBufferr  r   r  rB  r   r
  rx   rx   ry   r    s
    zPointwise.constant_to_deviceN)r   r   r   r  r  r  r  r  rx   rx   rx   ry   r    s
   	r  c                   @  sF   e Zd ZU ded< dZded< dddd	d
ZdddddddZdS )Scatterr  output_indexerNrD   scatter_moder|  rm   r  c                 C  s6   |   }ttd||}t|| j|| j| j| jdS )r  r  )r   r   r  r  r  r  )	r  r   ro   r  r  r   r  r  r  r  rx   rx   ry   r    s    zScatter.constant_to_devicer   r  rl  r   r  c                 C  s6   |   }|d u rd}tj||| |||| jdS )Nr  )mode)r  r]   r  r  r  r	  rx   rx   ry   r  
  s    zScatter.store_output)r   r   r   r   r  r  r  rx   rx   rx   ry   r    s   
r  
logical_ormaximumZminimummuladdZbitwise_xor)anyr  minprodsumxor_sumz"dict[str, Callable[..., OpsValue]]REDUCTION_COMBINE_FNCallable[..., object])reduction_typer   arg_break_ties_leftrs   c                   sf   t v rt  S dv r6dddd fdd}|S dkrTddddd	d
}|S td d S )Nargmaxargminztuple[object, object]tuple[OpsValue, OpsValue])abrs   c                   s   | \}}|\}}dkr&t ||}nt ||}t ||}trt ||}t ||}	t |t ||	}t |t ||	} rt ||n
t ||}
t |t ||
}t |||t |||fS )Nr%  )	r]   ltgteqr    ner  logical_andwhere)r'  r(  Za_valueZa_indexZb_valueZb_indexmaskequalZa_isnanZb_isnanZtier"  r   r!  rx   ry   argmax_combine_fn-  s&    
z3get_reduction_combine_fn.<locals>.argmax_combine_fnwelford_combinez#tuple[OpsValue, OpsValue, OpsValue]c                 S  sR   | \}}}|\}}}|| }|| }	||	 }
|||
  || || | |
  |	fS rt   rx   )r'  r(  Za_meanZa_m2Za_weightZb_meanZb_m2Zb_weightdeltaZ
new_weightZ	w2_over_wrx   rx   ry   welford_combine_fnN  s    


z4get_reduction_combine_fn.<locals>.welford_combine_fnzunknown reduction_type=)r  rc  )r!  r   r"  r2  r5  rx   r1  ry   get_reduction_combine_fn%  s    r6  c                      sD  e Zd ZU ded< ded< ded< ded< d	d
ddZeZdbddd fddZdd
ddZdd
ddZddddddddZ	d d
d!d"Z
d#d
d$d%Zdcdddd&d'Zd(d)d*d+d,Zeddd(ddd.ddd/d0d1d2d3
d4d5Zed6dd	dd7d8d9d:Zeejd-fd(ddd;ddddd1d<d=
d>d?Zed	dd@dAdBdCZed	dd@dAdDdEZedFd dddGdHdIZedFd1dJdKdLdMZeded.ddFdFdFd@d1dNdOdPdQZedRdddSdSdRdTdUdVZed(ddd;dddWdXddFdd<dYdZd[Zedfd(ddd;ddddFdd1d<d\d]d^Zed(ddd;dddXdXddd<d_d`daZ  ZS )g	Reductionr  reduction_rangesrC   r!  ra  	src_dtyperG   reduction_hintr   rD  c                 C  s
   |  dS )N)r  r8  r!  r  rF  rx   rx   ry   r  l  s    zReduction.__str__Frp   r  r  c                   s(   t   t j fdd| jD  B S )Nc                 3  s   | ]}t | V  qd S rt   r  r  r3  rx   ry   r  s  r   z1Reduction.get_free_symbol_uses.<locals>.<genexpr>)r  r  r/   r  r8  r  r  r3  ry   r  q  s    zReduction.get_free_symbol_usesr  c                 C  s   | j S rt   )r8  rF  rx   rx   ry   r  v  s    zReduction.get_reduction_sizer   c                 C  s   | j S rt   )r!  rF  rx   rx   ry   r  y  s    zReduction.get_reduction_typer  rl  Sequence[Symbol]r   r  r  r  reduction_varsrs   c              	   C  s4   t | j| j| j| ||}t |p(d|||S r  )r]   	reductionr   r9  r!  r  store_reduction)rB  r  r  r  r=  r   rx   rx   ry   r?  |  s    
zReduction.store_reductionrv   c                 C  s   t | jt | j S rt   )r   r  r8  rF  rx   rx   ry   index_length  s    zReduction.index_lengthSequence[Sequence[Expr]]c                 C  s$   |  | j}|  | jtj}||fS rt   )r  r  r8  r3   R0_INDEX)rB  r   rindexrx   rx   ry   r    s    zReduction.inner_fn_argsc                 C  s.   |  | j}|  | jtj}t| j|||dS r  )r  r  r8  r3   rB  r<   r  )rB  r   r   rC  rx   rx   ry   r    s
    
zReduction.inner_fn_free_symbolsr|  rm   r  c              
   C  s>   |   }ttd||}t|| j|| j| j| j| j	t
jdS )r  r  r   r   r  r  r8  r!  r9  r:  )r  r   ro   r  r7  r   r  r8  r!  r9  rG   DEFAULTr  rx   rx   ry   r    s    zReduction.constant_to_deviceNr   z%Union[ReductionType, Literal['scan']]r   r   tuple[ReductionHint, _IntLike])
r   	dst_dtyper9  r  r  r8  r!  reduction_numel
input_noders   c	           "   
   C  s  t jj|}	t jjt|}
|dkpFt j| tj oF|dvoFtj	}t
|	rXt
|
sbtjdfS t| }|j}d}|rtjt jj| dd}tjt jj| dd}ndddd	d
d}|}|
dkr||	|
}|dkrtj|fS |d urt|trttdd t|\}}W d    n1 s(0    Y  |d ur|d urt jjt|| }|	|krtd||||| tjdfS tj|fS |	|ks|
|d d krtjdfS t| |||||dkr|nd|tjd}ddddd}||\}}|r
||\}}t|dkr"tjdfS t |! |" \\}}}d}d}|D ]Z}t jj#||}t jj$||t%|& } t'dd | D }!|!r|d7 }n|d7 }qJ||krtj||	|
fS tj(||	|
fS d S )Nscanr#  r4       T)Zinner_reductionFrv   )reduction_numel_hint
numel_hintrs   c                 S  s   dS Nr4   rx   )rL  rM  rx   rx   ry   inner_reduction_splits  s    z4Reduction.num_splits.<locals>.inner_reduction_splitsr  zUse previous IRNode's range and reduction_ranges instead of split. current ranges: %s, current reduction ranges: %s, current split: %d, new ranges: %s, new reduction ranges: %sr  r   r  rD  r7  ztuple[Sequence[Expr], bool])r  rs   c                   s   t d t|  |  |  d| d}| }|jd us:J dd |jD }g }d}t|jdd dD ]n t	 fd	d
|D rd|
 j  jtjjv rdtjj j }t|jdd }|  t|jdd |krdd}qd||fS )Nr   r   r   r   r  r  c                 S  s&   g | ]}t |trt |tjs|qS rx   )ru   r   r   Numberr   r  rx   rx   ry   r     s   zBReduction.num_splits.<locals>.get_read_indices.<locals>.<listcomp>Fc                 S  s   | j S rt   r   rw   rx   rx   ry   <lambda>  r   z@Reduction.num_splits.<locals>.get_read_indices.<locals>.<lambda>keyc                 3  s   | ]}| j jv V  qd S rt   )r   r&   rS  Zmdrx   ry   r     r   zAReduction.num_splits.<locals>.get_read_indices.<locals>.<genexpr>r   T)ComputedBufferr   r   r   r   r  
range_varssortedr  r
  appendr   r   r_   r   Zname_to_bufferr   r  decide_layout)r  cbread_writesrY  indiceschangedbufZoriginal_striderx   rW  ry   get_read_indices	  s4    	z.Reduction.num_splits.<locals>.get_read_indicesr   c                 s  s   | ]}|d kV  qdS r4   Nrx   r   rx   rx   ry   r  <  r   z'Reduction.num_splits.<locals>.<genexpr>))r_   r   r   r  rZ   has_featurer7   ZREDUCE_TO_SINGLE_ELEMENTr5   Zsplit_reductionsrz   rG   rE  rF   r  Zmulti_processor_count	functoolsr	   choicesZreduction_split_factorZINNERru   rl   r   ro   r   r=   logdebugr7  r   r6   index_vars_squeezer   r  simplify_with_rangesstride_hintsr   keysr
  OUTER)"r   rG  r9  r  r  r8  r!  rH  rI  rL  rM  Zshould_splitpropsZnum_smZmin_elements_per_threadrO  Zouter_reduction_splitssplit
new_rangesnew_reduction_rangesZextracted_numel_hintr  rb  r_  r`  r   r=  Zranges1Z	num_outerZ	num_innerr   jr  outerrx   rx   ry   
num_splits  s    	






 




!


zReduction.num_splitsz<Callable[[Sequence[_IntLike], Sequence[_IntLike]], OpsValue]z(Callable[[Sequence[_IntLike]], OpsValue])r  r8  r!  r9  rs   c                   s   dd D t || ddd fdd|dv rttd	d	t ddd
dfddfddS S d	S )z1Convert inner_fn from a reduction to an pointwisec                 S  s   g | ]}t jj|qS rx   )r_   r   r   Zevaluate_static_shaper   rr   rx   rx   ry   r   R  s   z2Reduction._unroll_reduction_fn.<locals>.<listcomp>r  r
   r   c                   s,   t  fddtjdd D  D S )Nc                 3  s   | ]} |V  qd S rt   rx   )r   rC  )r   value_fnrx   ry   r  [  s   z=Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<genexpr>c                 S  s   g | ]}t |qS rx   )r   ru  rx   rx   ry   r   ^  r   z>Reduction._unroll_reduction_fn.<locals>.fn.<locals>.<listcomp>)re  reduce	itertoolsproductr   )
combine_fnr8  rv  r   ry   r   X  s    z*Reduction._unroll_reduction_fn.<locals>.fnr%  r$  Nr&  )r   rC  rs   c                   s*   dd |D }| |t  |tjfS )Nc                 S  s   g | ]}t |qS rx   )r   expandr   rx   rx   ry   r   o  r   zDReduction._unroll_reduction_fn.<locals>.value_fn.<locals>.<listcomp>)r]   
index_exprr   int64r   rC  )flatten_indexr  rx   ry   rv  l  s    z0Reduction._unroll_reduction_fn.<locals>.value_fnc                   s    | d S rN  rx   r   )r   rx   ry   rT  u  r   z0Reduction._unroll_reduction_fn.<locals>.<lambda>)r6  r  r   r   r  )r  r8  r!  r9  rx   )rz  r  r   r  r8  rv  ry   _unroll_reduction_fnJ  s     
	zReduction._unroll_reduction_fnr  rl   )
r   rG  r9  r  r  r8  r!  r:  rI  rs   c
                   s&  t jjtdkrddd fdd}
|
d|
d|
d|
dd v sfJ  d	d
dd fdd}tj|||t|dS dkrdv rd
dd fdd}nd
ddfdd}tj| ||dS t	t
r<t jjtjk r<t|dkst|jr<tj| | ||dS | | |||		\}}d
d
dfdd}||}|tjkr|}|dkr|	d usJ t|	\}}|d usJ |d usJ | | |||||
S |dkr| | |||||	
S tt| |||dS )Nr   ro   zUnion[bool, float, int]valrs   c                   sL    t jkrt| S  jr0t| tjs(J t| S t| tjs@J t| S d S rt   )	r   rp   r  ru   typingSupportsFloatr  SupportsIntrv   r  rG  rx   ry   py_cnst  s    
z!Reduction.create.<locals>.py_cnstr4   )r  r  r  r  z* not supported for zero-dimension tensors!rv   r^   r   c                   s   t   S rt   r]   r  r   )rG  r!  rtypes_to_initsrx   ry   const_fn  s    z"Reduction.create.<locals>.const_fnr  r{  c                   s   t d S rq  r  r   r  rx   ry   r     s    zReduction.create.<locals>.fnc                   s   dd D } | |S )Nc                 S  s   g | ]}t jjqS rx   r   r  r  r   rx   rx   ry   r     r   z0Reduction.create.<locals>.fn.<locals>.<listcomp>rx   r   reduction_index)r  r8  rx   ry   r     s    )ro  rs   c                   s(   t  r| S | dkr t| tjS | S d S rN  )rz   r  r5   Zmin_num_split)ro  )rH  rx   ry   _maybe_increase_split  s
    z/Reduction.create.<locals>._maybe_increase_splitr  rD  )r_   r   r   simplifyrZ   rl  r  r  r   ru   r   r  r5   Zunroll_reductions_thresholdrV   r   r  rt  rG   rE  r=   !create_multilayer_existing_rangescreate_multilayerrl   r7  )r  r   rG  r9  r  r  r8  r!  r:  rI  r  r  r   hintro  r  rp  rq  rx   )rG  r  rH  r8  r!  r  ry   r  z  s    

	

zReduction.create#Union[_NumLike, Sequence[_NumLike]]r!  r   rs   c                 C  s   | dv r0t |rtdS t|r$dS t|jS | dv r`t |rHtdS t|rTdS t|jS t|rldnd}t|r|dnd}|||||||f|||ftd|fd	|  S )
N)r  r$  z-infF)r  r%  infTr   r4   )r  r  r  r  welford_reducer3  online_softmax_reduce)r    r  r   r   iinfor  r  )r!  r   ZzeroZonerx   rx   ry   default_accumulator  s0    
zReduction.default_accumulatorc                 C  s   | dkrdS t | |S )Nr  r   )r7  r  r!  r   rx   rx   ry   default_value:  s    zReduction.default_valuerh   )ro  rM  r:  rs   c                 C  sP   | dkr|S | dkr,|dkr,|t jkr,t jS | dkrL|dkrL|t jkrLt jS |S )Nr     i      )rG   rm  Z
OUTER_TINY)ro  rM  r:  rx   rx   ry   _multilayer_second_step_hintB  s    z&Reduction._multilayer_second_step_hintr  )rH  rI  rs   c                 C  s   |du rdS t jj| |s$dS |  zt| W n tyL   Y dS 0 | }t	|dd D ] \}}t jj|drf|  S qfdS )z
        If we are reducing over the full tensor, and it is non-dense in the last dimension,
        reindex so we reduce over the dense dimension. initially just handle complete
        reduction case
        Nr  r4   )
r_   r   r   r  rp  ru  r  rc  r  r   )r  rH  rI  r  r   r   rx   rx   ry   $check_for_split_dense_dim_reindexingS  s     	
z.Reduction.check_for_split_dense_dim_reindexingr   )r
  r8  rH  ro  
block_sizedefaultrI  rs   c           
        s\   |  |}t|g|tjjt| d dddd fdd}	|	S )Nr   r;  r^   r   r  rs   c                   st   |\}| ^ }| |  dd fdd}rjt }tt |t|}t||S | S d S )Nr^   rD  c                     s    gS rt   rx   rx   )r_  r
  	new_indexr   rx   ry   body  s    zCReduction._multilayer_wrap_loader.<locals>.wrapper_fn.<locals>.body)rP   r]   r)  r}  Zmasked)r   r  Zreduction_blockr  Zindex_dtyper/  r  r  r
  	need_maskrH  r   )r_  r  ry   
wrapper_fn  s    


z5Reduction._multilayer_wrap_loader.<locals>.wrapper_fn)	r  Viewdynamic_reshape_indexerr_   r   r   r,   r   rs  )
r  r
  r8  rH  ro  r  r  rI  Zdense_indexr  rx   r  ry   _multilayer_wrap_loaders  s     z!Reduction._multilayer_wrap_loaderz@Callable[[Sequence[sympy.Expr], Sequence[sympy.Expr]], OpsValue]Sequence[Integer])r
  original_rangesoriginal_reduction_rangesrp  rq  rs   c                   sV   t dd D s J dt|t|t| dddd fdd}|S )	Nc                 s  s   | ]}|d kV  qdS rc  rx   rS  rx   rx   ry   r    r   zDReduction._multilayer_wrap_loader_existing_ranges.<locals>.<genexpr>z8Only enabled for numel_hint == 1, found original_ranges=r  r^   )merged_indexnew_reduction_indexrs   c                   s:   | d t  }| t d  } |t|t| S rt   )r   r   )r  r  Zoriginal_idxr  r
  r  r   rx   ry   r    s    zEReduction._multilayer_wrap_loader_existing_ranges.<locals>.wrapper_fn)r
  r  r  r   )r  r
  r  r  rp  rq  r  rx   r  ry   '_multilayer_wrap_loader_existing_ranges  s    	z1Reduction._multilayer_wrap_loader_existing_rangesr0  list[Integer])r   rG  r9  r  r  r  rp  rq  r!  ro  r:  rs   c                   s   |t jt jfvr|nt j}t|||||||	|}|  |  dddd fdd}tj	j
t|}| |
||}||dt| ksJ tt|||||t|d |	||dS )a
        Break a large reduction up into multiple smaller reductions
        recursively
        r  r^   r  c                   s    g | |S rt   rx   r  Zintermediate_loaderrx   ry   intermediate_fn  s    z;Reduction.create_multilayer_helper.<locals>.intermediate_fnNrD  )r   float16Zbfloat16r  r7  r  ru  r  r_   r   r   r   rZ   r  r   rl   )r  r   rG  r9  r  r  r  rp  rq  r!  ro  r:  Zintermediate_dtypeZintermediater  rM  rx   r  ry   create_multilayer_helper  sD    
z"Reduction.create_multilayer_helper)r   rG  r9  r  r  r8  r!  ro  r:  rI  rs   c                 C  sd   t |}t||d  |}| ||}| |||||||
}| ||||||g |||g|||	S )r  r4   )rZ   r1   r  r  r  )r  r   rG  r9  r  r  r8  r!  ro  r:  rI  rH  r  r  r  rx   rx   ry   r    s2    

zReduction.create_multilayer)r   rG  r9  r  r  r  rp  rq  r!  r:  rs   c                 C  s8   |  |||||}| ||||||g ||||	d|
S )r  r  )r  r  )r  r   rG  r9  r  r  r  rp  rq  r!  r:  r  rx   rx   ry   r  )  s(    
z+Reduction.create_multilayer_existing_ranges)F)F)N)N)N)r   r   r   r   r  r  r  r  r  r?  r@  r  r  r  r  rt  r  r  rG   rE  r  r  r  r  r  r  r  r  r  r  r  rx   rx   r  ry   r7  d  sZ   

 $ !/
$ !	  *&? &-r7  c                
      sR   e Zd ZU ded< dddddddddd		 fd
dZddddddddZ  ZS )MultiOutputReductionrv   output_indexr|  ra  z)Union[INNER_FN_TY, Sequence[INNER_FN_TY]]r  rC   rG   )	r   rG  	inner_fnsr  r8  r!  r9  r:  r  c
              
     s`   t  r f t dkr$ d }
ndddd fdd}
t j|||
|||||d |	| _d S )	Nr4   r   rl  ztuple[OpsValue, ...]r   reduction_idxrs   c                   s   t  fddD S )Nc                 3  s   | ]}| V  qd S rt   rx   r   r   r   r  rx   ry   r  n  r   z@MultiOutputReduction.__init__.<locals>.loader.<locals>.<genexpr>)r   r  r  r  ry   r
  k  s    z-MultiOutputReduction.__init__.<locals>.loaderrD  )callabler   r  __init__r  )rB  r   rG  r  r  r8  r!  r9  r:  r  r
  r  r  ry   r  W  s     

zMultiOutputReduction.__init__r   r  rl  r;  r   r<  c              	   C  sZ   t | j| j| j| ||}t|ttfs:J t	| || j
 }t |pNd|||S r  )r]   r>  r   r9  r!  r  ru   r   r   r   r  r?  )rB  r  r  r  r=  r   r   rx   rx   ry   r?  |  s    

z$MultiOutputReduction.store_reduction)r   r   r   r   r  r?  r  rx   rx   r  ry   r  T  s   
"%r  c                   @  s8   e Zd Zeejdfdddddddddd	d

ddZdS )OnlineSoftmaxReductionNr|  ra  r  rl  rv   rG   r   Sequence[TensorBox])
r   rG  r9  r  r  r8  
num_outputr:  rI  rs   c
                   s<   t  fddt|D }
|
D ]}|  q*|
S )z>
        Create the reduction disregarding splitting.
        c                 3  s,   | ]$}t t d |	V  qdS )r  N)rl   r  r  r   Z
output_idxr   rG  r  r  r:  r8  r9  rx   ry   r    s   z0OnlineSoftmaxReduction.create.<locals>.<genexpr>)r   r   ru  )r  r   rG  r9  r  r  r8  r  r:  rI  resultsr   rx   r  ry   r    s    
zOnlineSoftmaxReduction.create)r   r   r   r  rG   rE  r  rx   rx   rx   ry   r    s   
r  c                   @  sj   e Zd Zeejfdddddddddd	d
ZeddddddZedddddddddd	ddZ	dS )WelfordReductionr|  ra  zSequence[Callable[..., Any]]r  rC   rG   r  )r   r   r  r  r8  r!  r:  rs   c              
     sF  dv sJ t jjt}dddfdd}	|dkr`|	d}
|	d}|	d}|
||fS |dkrd	dd
fdd dkr d |	d|	dfS t fddD S tjd |d\}}tj	kr||dkr| 
|S fddtdD }|D ]}|  q2|S )N)r  r3  rv   rl   r  c                   s,   ddd fdd}t j|tdS )Nrl  r^   r   rs   c                   s   t  S rt   r  r   )r   r  rx   ry   r    s    z8WelfordReduction.create.<locals>.const.<locals>.inner_fnr  r  r  r   )r  r  )r   r   r  r  ry   const  s    z&WelfordReduction.create.<locals>.constr   r4   z4Callable[[Sequence[Expr], Sequence[Expr]], OpsValue])r
  rs   c                   s,   ddd fdd}t j|tdS )Nrl  r^   r  c                   s   dd D } | |S )Nc                 S  s   g | ]}t jjqS rx   r  r   rx   rx   ry   r     r   zKWelfordReduction.create.<locals>.copy.<locals>.inner_fn.<locals>.<listcomp>rx   )r   r  )r
  r8  rx   ry   r    s    z7WelfordReduction.create.<locals>.copy.<locals>.inner_fnr  r  )r
  r  )r   r   r  r8  r
  ry   copy  s    z%WelfordReduction.create.<locals>.copyr  c                 3  s   | ]} |V  qd S rt   rx   r  )r  rx   ry   r    r   z*WelfordReduction.create.<locals>.<genexpr>)r!  rH  c                   s*   g | ]"}t t |	qS rx   )rl   r  r  r  )r   r   r  r  r:  r8  r!  rx   ry   r     s   z+WelfordReduction.create.<locals>.<listcomp>r   )r_   r   r   r  rZ   r   r7  rt  rG   rE  r  r   ru  )r  r   r   r  r  r8  r!  r:  rH  r  meanm2Zweightr  ro  r  r   rx   )r  r   r   r  r  r:  r8  r!  ry   r    sT    



zWelfordReduction.creater   r  r  c                 C  s   dS )N)r   r   r   rx   r  rx   rx   ry   r  .  s    zWelfordReduction.default_valuerh   )	r   r   r  r  r8  r!  ro  r:  rs   c	              
     s8  t tjjt d }	|	r||dkr|dddddfdd}
j||d t|
dd	t|
d
d	f|d|dS t	d
   t
|t fdd|D g | g||}|D ]}|  qdddddddtjjt |}||}t
|tfdd|D |gd|S )r  r   r3  rl  rv   r^   )r   r  r   rs   c                   s   t | S rt   r  )r   r  r   rb  rx   ry   r  M  s    z4WelfordReduction.create_multilayer.<locals>.constantr   r4   )r   r   r  r  r8  r!  ro  r:  c              	   3  s$   | ]}j | d dV  qdS )r   )r  N)r  )r   r
  )r  r  rH  r8  ro  rx   ry   r  e  s   	z5WelfordReduction.create_multilayer.<locals>.<genexpr>r  )r   r  r
  rs   c                 S  s   |g | |S rt   rx   )r   r  r
  rx   rx   ry   intermediate_loader_fnx  s    zBWelfordReduction.create_multilayer.<locals>.intermediate_loader_fnc                 3  s   | ]}t  | d V  qdS )r  N)r	   r  r   )r  rx   ry   r    s   )rZ   r_   r   r   r,   r   rs  r  r	   r1   r  r  r   ru  r   r  )r  r   r   r  r  r8  r!  ro  r:  r  r  Zintermediatesr   rM  rx   )r  r  r   r  rH  r8  ro  ry   r  4  sb    

	

z"WelfordReduction.create_multilayerN)
r   r   r   r  rG   rE  r  r  r  r  rx   rx   rx   ry   r    s   	 xr  c                      sH  e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< dAddd fddZdd fddZdddddddd Zddd!d"Zd#dd$d%Zddd&d'Z	ddd(d)Z
d
dd*d+Zd,dd-d.ZdBdddd/d0Zeejfd1d2d3dd4dd
dddd5d6d7
d8d9Zed3d:d;d
dddd<d=d>	d?d@Z  ZS )CScanr  scan_rangesr   z=Callable[[tuple[Any, ...], tuple[Any, ...]], tuple[Any, ...]]rz  zFCallable[[Sequence[_IntLike], Sequence[_IntLike]], Sequence[_IntLike]]r   rG   r:  rv   r  tuple[torch.dtype, ...]dtypestuple[Callable[..., Any], ...]r  Frp   r  r  c                   sD   t   t j fdd| jD  B t j fdd| jD  B S )Nc                 3  s   | ]}t | V  qd S rt   r  r  r3  rx   ry   r    r   z,Scan.get_free_symbol_uses.<locals>.<genexpr>c                 3  s   | ]}t | V  qd S rt   r  r  r3  rx   ry   r    r   )r  r  r/   r  r  r   r  r  r3  ry   r    s    
zScan.get_free_symbol_usesr   rD  c                   s0   t | jt | j t | jks"J t   d S rt   )r   r  r  r   r  rG  rF  r  rx   ry   rG    s    "zScan.__post_init__r   z%Callable[[Sequence[_IntLike]], Never]rl  r;  )r  r  r  	scan_varsrs   c                   sR   |  || t fdd| jD }t| j| j|}t|p@d| || j S )Nc                 3  s   | ]}| V  qd S rt   rx   r   r  r  rx   ry   r    r   z'Scan.store_reduction.<locals>.<genexpr>r  )	r   r   r  r]   rJ  r  rz  r  r  )rB  r  r  r  r  r   resultrx   r  ry   r?    s    zScan.store_reductionc                 C  s   dS )NZcustomrx   rF  rx   rx   ry   r    s    zScan.get_reduction_typer  c                 C  s   | j S rt   )r  rF  rx   rx   ry   r    s    zScan.get_reduction_sizec                 C  s   | j S rt   r   rF  rx   rx   ry   r     s    zScan.get_sizec                 C  s   | j S rt   r  rF  rx   rx   ry   r    s    zScan.get_pointwise_sizec                 C  s   t | jt | j S rt   )r   r  r  rF  rx   rx   ry   r@    s    zScan.index_lengthr  c                 C  s.   |  | j}|  | jtj}| ||}|fS rt   )r  r  r  r3   rB  r   rB  r   rC  r   rx   rx   ry   r    s    zScan.inner_fn_argsc                 C  s8   |  | j}|  | jtj}| ||}t| j||dS r  )r  r  r  r3   rB  r   r<   r  rB  r   r   rC  r   rx   rx   ry   r    s    zScan.inner_fn_free_symbolsT)can_fallback_to_atenr|  z+tuple[Callable[[Sequence[Expr]], Any], ...]r
   Sequence[Optional[TensorBox]])
r   r  r  r   axisrz  r:  r  r   rs   c                  s  g d    d d    g	t jtjsHd gt S tdkrrt jtjsrd gt S t jj}
|
t		}ttksJ |

t|dr·fddttD S | jd d  	|d\}t
|dkrPtjjd u s to*tdko*tdk}|sL|rFd gt S d}nt
dddd	 	fd
d	
fddttD }|D ]}|  q|S )Nr4   c                   s&   g | ]}t j | | d qS r  r  r  r   r  r   r  r  r   rx   ry   r     s   zScan.create.<locals>.<listcomp>r   )r   r   r  r  pointwise_rangesr  rz  
scan_numelz3.3.0rl  r0  )r   
scan_indexrs   c                   sH   t |t ksJ t | t ks(J g | d   ||  d  S rt   r   )r   r  )r  r  r  rx   ry   r   	  s    zScan.create.<locals>.reindexc                   sB   g | ]:}t 	f | | 
 |d qS ))r   r   r  r  r  r   r  r  rz  r   r:  r  )rl   r  r  )rz  r   r  r  r   r  r:  r   r  	scan_typer   rx   ry   r   $	  s&   )r_   r   rd  r7   ZSCANr   ZTUPLE_REDUCTIONr   r  rZ   r,   r   Ler   rt  r  r   versionZhip
has_tritontriton_version	SplitScanru  )r  r   r  r  r   r  rz  r:  r  r   r   r  rt  Zsupports_splitr  r  rx   )r  rz  r   r  r  r   r  r:  r   r  r  r   ry   r    sV     







zScan.createra  r  r   rF  )	r   r   r  r  r  r  rz  r  rs   c	           
   
     s2   dddd fdd}	t j||||	||d|dS )Nrl  r^   r  c                   s$   g | d   ||  d  S rt   rx   r  r  r  rx   ry   r  K	  s    z#Scan.num_splits.<locals>.wrapper_fnrJ  )r   rG  r9  r  r  r8  r!  rH  )r7  rt  )
r  r   r   r  r  r  r  rz  r  r  rx   r  ry   rt  >	  s    zScan.num_splits)F)F)r   r   r   r   r  rG  r?  r  r  r   r  r@  r  r  r  rG   rE  r  rt  r  rx   rx   r  ry   r    s2   
	&ar  c                   @  s   e Zd ZdS )r  Nr   r   r   rx   rx   rx   ry   r  [	  s   r  c                      s(  e Zd ZU ded< ded< ded< ded< ded	< d
ed< ded< ded< ded< d6ddd fddZdd fddZddddddddZddd d!Zddd"d#Zddd$d%Z	ddd&d'Z
ddd(d)Zd*dd+d,Zd7dddd-d.Zeejfd/d
d0dddddd1d2d3
d4d5Z  ZS )8Sortr  sort_rangesr   z:Callable[[Sequence[Expr], Sequence[Expr]], Sequence[Expr]]r   rG   r:  rv   r  r  r  r  r  rp   stable
descendingFr  r  c                   sD   t   t j fdd| jD  B t j fdd| jD  B S )Nc                 3  s   | ]}t | V  qd S rt   r  r  r3  rx   ry   r  u	  r   z,Sort.get_free_symbol_uses.<locals>.<genexpr>c                 3  s   | ]}t | V  qd S rt   r  r  r3  rx   ry   r  x	  r   )r  r  r/   r  r  r   r  r  r3  ry   r  q	  s    
zSort.get_free_symbol_usesr   rD  c                   s0   t | jt | j t | jks"J t   d S rt   )r   r  r  r   r  rG  rF  r  rx   ry   rG  |	  s    "zSort.__post_init__r   r  rl  r<  c                   sV   |  || t fdd| jD }t| j|| j| j}t|pDd| || j	 S )Nc                 3  s   | ]}| V  qd S rt   rx   r  r  rx   ry   r  	  r   z'Sort.store_reduction.<locals>.<genexpr>r  )
r   r   r  r]   sortr  r  r  r  r  )rB  r  r  r  r=  r   r  rx   r  ry   r?  	  s    zSort.store_reductionc                 C  s   dS )Nr  rx   rF  rx   rx   ry   r  	  s    zSort.get_reduction_typec                 C  s   | j S rt   )r  rF  rx   rx   ry   r  	  s    zSort.get_reduction_sizec                 C  s   | j S rt   r  rF  rx   rx   ry   r   	  s    zSort.get_sizec                 C  s   | j S rt   r  rF  rx   rx   ry   r  	  s    zSort.get_pointwise_sizec                 C  s   t | jt | j S rt   )r   r  r  rF  rx   rx   ry   r@  	  s    zSort.index_lengthrA  c                 C  s.   |  | j}|  | jtj}| ||}|fS rt   )r  r  r  r3   rB  r   r  rx   rx   ry   r  	  s    zSort.inner_fn_argsc                 C  s8   |  | j}|  | jtj}| ||}t| j||dS r  )r  r  r  r3   rB  r   r<   r  r  rx   rx   ry   r  	  s    zSort.inner_fn_free_symbolsr|  z'tuple[Callable[[list[Expr]], Any], ...]r
   r  )
r   r  r  r   r  r  r  r:  r   rs   c	                   s4  g 	d   	 d d  	  g
t jtjsHd gt S t jj}
|
t
}d}t	j
joz|
t||}|sd gt S ttksJ |
t|drԇ	fddttD S dddd 
fdd		
fd
dttD }|D ]}|  q |S )Nr4   r  c                   s&   g | ]}t j | | d qS r  r  r  r  rx   ry   r   	  s   zSort.create.<locals>.<listcomp>rl  r0  )r   
sort_indexrs   c                   sH   t |t ksJ t | t ks(J g | d   ||  d  S rt   r  )r   r  )r  r  r  rx   ry   r   	  s    zSort.create.<locals>.reindexc                   sD   g | ]<}t tf | | 	|
 d qS ))r   r   r  r  r  r   r  r  r   r:  r  r  r  )rl   r  r  r  )r  r   r  r  r   r  r:  r   r   r  r  rx   ry   r   	  s(   )r_   r   rd  r7   ZSORTr   r   r  rZ   r5   r   Zpersistent_reductionsr,   r   r  r   ru  )r  r   r  r  r   r  r  r  r:  r   r   Z
sort_numelZ
max_rblockZis_persistent_kernelr  r  rx   )r  r  r   r  r  r   r  r:  r   r   r  r  ry   r  	  s0     



zSort.create)F)F)r   r   r   r   r  rG  r?  r  r  r   r  r@  r  r  r  rG   rE  r  r  rx   rx   r  ry   r  `	  s,   

r  c                 C  s,   zt | dd W dS  ty&   Y dS 0 d S )NFfreezeT)r  rc  rw   rx   rx   ry   r   	  s
    r   c                 C  sD   z*t | dd\}}| r"|  | W S  ty>   Y dS 0 d S NFr  )r  should_pad_stridespad_stridesis_contiguousrc  )rr   _bufferr  rx   rx   ry    is_contiguous_storage_and_layout
  s    
r  z'Optional[Sequence[Union[int, Integer]]]ztuple[StorageBox, Layout])rr   r  want_contiguousstride_orderr  r  rs   c           	      C  s   t | tr t| j|||||dS t | trRt| j|||||d\}}| | j fS t | tr|r|r~|   |   sJ n8|dur| j	||d n |dur| j
||d n|   t| |  fS t | trt| j|d\}}|| jfS tdS )z
    Try to simplify x into a StorageBox and a Layout.

    allow_padding only affect how we apply stride_order. When allow_padding
    is True, we have the freedom to add padding when applying the stride_order.
    r  r   r  r  r  Nr  r  )ru   rl   r  r  
StorageBoxr   Bufferr  r  r  r  r\  r!  r  rc  )	rr   r  r   r  r  r  r   r  bufferrx   rx   ry   r  
  sR    






r  )rr   r  rs   c                 C  s6   zt | dd\}}||W S  ty0   Y dS 0 d S r  )r  is_stride_orderedrc  )rr   r  r  r  rx   rx   ry   "is_stride_order_storage_and_layoutI
  s
    r  )r   rs   c                 C  sr   t | ttfrt| jS t | trT| j}t|jt	|j
 t dk }t| jpR|S t | trn|  tjjv S dS )Nr   F)ru   rl   r  is_unalignedr  r!  r  r,   r   rQ   r   rS   r  r  r_   r   unaligned_buffers)r   r  Zhas_unaligned_layoutrx   rx   ry   r	  S
  s    


r	  c                   @  s6  e Zd ZU ded< dDdddddZd	d
ddZdd
ddZdd
ddZedd
ddZ	dd
ddZ
dd
ddZdd
ddZdd
d d!Zd"d
d#d$Zd%d&d'd(d)Zdd
d*d+Zd,d
d-d.Zd/d0 Zd1d2 Zdd
d3d4Zdd
d5d6Zd7d
d8d9Zd:d
d;d<Zd=d> Zd?dd@dAdBZdCS )EBaseViewrm   r  Frp   r  r  c                 C  s   | j |S rt   r  r  r  rx   rx   ry   r  i
  s    zBaseView.get_free_symbol_usesz*Callable[[Sequence[Expr]], Sequence[Expr]]rD  c                 C  s   t d|  d S )Nzmake_reindexer NYI on r  rF  rx   rx   ry   make_reindexerl
  s    zBaseView.make_reindexerr  c                   s,   | j   |  ddd fdd}|S )Nrl  r   r  c                   s    | S rt   rx   r  innerr   rx   ry   r  s
  s    z&BaseView.make_indexer.<locals>.indexer)r  r  r  rB  r  rx   r  ry   r  o
  s    
zBaseView.make_indexerr  c                   s,   | j   |  ddd fdd}|S )Nrl  r^   r  c                   s    | S rt   rx   r  r  rx   ry   r
  |
  s    z$BaseView.make_loader.<locals>.loader)r  r  r  rB  r
  rx   r  ry   r  x
  s    
zBaseView.make_loaderra  c                 C  s
   | j  S rt   )r  r   rF  rx   rx   ry   r   
  s    zBaseView.dtypere  c                 C  s
   | j  S rt   r  r   rF  rx   rx   ry   r   
  s    zBaseView.get_layoutr{  c                 C  s
   | j  S rt   r  r   rF  rx   rx   ry   r   
  s    zBaseView.get_devicer<  c                 C  s   d S rt   rx   rF  rx   rx   ry   rP  
  s    zBaseView.get_origin_noder   c                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  
  s    zBaseView.get_namerl  c                 C  s   |   S rt   ro  rF  rx   rx   ry   r  
  s    zBaseView.get_pointwise_sizerv   r   r  c                 C  s   | j |S rt   r  r  r  rx   rx   ry   r  
  s    zBaseView.mark_reusec                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  
  s    zBaseView.has_exceeded_max_readsr   c                 C  s
   | j  S rt   r  ru  rF  rx   rx   ry   ru  
  s    zBaseView.realizec                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  
  s    zBaseView.realize_hintc                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  
  s    zBaseView.get_storage_numelc                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  
  s    zBaseView.is_externc                 C  s
   | j  S rt   )r  is_module_bufferrF  rx   rx   ry   r  
  s    zBaseView.is_module_bufferrH  c                 C  s
   | j  S rt   r  rM  rF  rx   rx   ry   rM  
  s    zBaseView.get_read_namesr  c                 C  sF   t tdd$ t|  |  jW  d    S 1 s80    Y  d S r  )r   ro   r   r>   r  r   r  rF  rx   rx   ry   rL  
  s
    zBaseView.get_readsc                 C  s   | }t |tr|j}q|S rt   )ru   r  r  )rB  rr   rx   rx   ry   r  
  s    
zBaseView.unwrap_viewr|  r  c                 C  s2   |   }ttd||}t||  ||  dS r  )r  r   ro   r  r  r   r   r  rx   rx   ry   r  
  s    zBaseView.constant_to_deviceN)F)r   r   r   r   r  r  r  r  r  r   r   r   rP  r  r  r  r  ru  r  r  r  r  rM  rL  r  r  rx   rx   rx   ry   r  e
  s.   
		r  c                   @  sD   e Zd ZU ded< edd Zedd Zddd	d
Zdd Z	dS )r   r0  r   c                 C  s   t jj}tttj|}|  }dgt|t|  t| }t|t|ksRJ t	t|D ]}|| dkr|| dus~J || ||< q^|| du st jjj
jt|| dddrq^|j|| ||  dddks^J dq^|S )	zReplace `-1` with correct sizesNr  r4   TZsize_obliviousr   fallbackzKBroadcast failed in ExpandView({x.get_size()}, {new_size}) on dimension {i})r_   r   r   r   r\  r   r|  r   r   r   r   evaluate_exprrs  r   )rr   new_sizer   old_sizer   rx   rx   ry   _normalize_size
  s"     zExpandView._normalize_sizec           
      C  s   |  ||}t|rt|\}}t|t|j }|dks>J tjjg| }t|j	|jD ]6\}}|
tjjjjt|ddds|ntjj qZt|j|jt|||j}	t||	dS t||dS )Nr   r4   Tr  r  )r  r   )r#  r   r  r   r   r   r  r  r   r   r[  r_   r   r   r   r   rs  r  r   r   r   r   r!  r   )
r  rr   r!  r"  r#  skipr$  r   r   r%  rx   rx   ry   r  
  s.    
zExpandView.createrl  rD  c                 C  s   | j S rt   r  rF  rx   rx   ry   r      s    zExpandView.get_sizec                   s4   |   }| j   t|t   fdd}|S )Nc                   sR   t | d  } t| t ks$J tt D ]} | dkr0tjj| |< q0| S rN  )r   r   r   r   r  r  )r   r   actualr$  rx   ry   r     s    z*ExpandView.make_reindexer.<locals>.reindex)r   r  r   )rB  targetr   rx   r%  ry   r    s
    
	zExpandView.make_reindexerN)
r   r   r   r   r  r#  r  r  r   r  rx   rx   rx   ry   r   
  s   


r   c                   @  sD   e Zd ZU ded< edd Zedd Zddd	d
Zdd ZdS )PermuteViewr0  dimsc                   s   |  |}t|ttt|ks&J t|rxt|\} t j j fdd|D  fdd|D  j	}t
||dS t||dS )Nc                   s   g | ]} j | qS rx   r  r   r#  rx   ry   r   "  r   z&PermuteView.create.<locals>.<listcomp>c                   s   g | ]} j | qS rx   r   r   r*  rx   ry   r   #  r   r  )r  r)  )_map_neg_dimsr/   r   r   r   r  r  r   r   r   r!  r(  )r  rr   r)  r"  r%  rx   r*  ry   r    s    
zPermuteView.createc                   s    fdd D S )Nc                   s$   g | ]}|d kr|n
t  | qS r   r  )r   r  r)  rx   ry   r   ,  r   z-PermuteView._map_neg_dims.<locals>.<listcomp>rx   )r  r)  rx   r-  ry   r,  *  s    zPermuteView._map_neg_dimsrl  rD  c                   sD   t | | jt tt| jks&J | j   fdd| jD S )Nc                   s   g | ]} | qS rx   rx   r   r  rx   ry   r   3  r   z(PermuteView.get_size.<locals>.<listcomp>)r/   r,  r)  r   r   r  r   rF  rx   r  ry   r   .  s
    

zPermuteView.get_sizec                   s^   dd t | jD   fddtt| jD  t ttt| jksNJ  fdd}|S )Nc                 S  s   i | ]\}}||qS rx   rx   )r   r   rr  rx   rx   ry   r   6  r   z.PermuteView.make_reindexer.<locals>.<dictcomp>c                   s   g | ]} | qS rx   rx   r   invrx   ry   r   7  r   z.PermuteView.make_reindexer.<locals>.<listcomp>c                   s    fddD S )Nc                   s   g | ]} | qS rx   rx   r   r   rx   ry   r   ;  r   z?PermuteView.make_reindexer.<locals>.reindex.<locals>.<listcomp>rx   r   r.  r   ry   r   :  s    z+PermuteView.make_reindexer.<locals>.reindex)r   r)  r   r   r/   )rB  r   rx   r.  ry   r  5  s
    zPermuteView.make_reindexerN)	r   r   r   r   r  r  r,  r   r  rx   rx   rx   ry   r(    s   


r(  c                   @  s>   e Zd ZeddddZeddddZd	d
ddZdS )SqueezeViewNr  c                  sF  t |rt|\}}g }g } d urPt ts6J dd krL t|jk sPJ tt|j|jD ]`\}\}}	 d u r|dkr|	| |	|	 qb| kr|	| |	|	 qb|dksbJ dqbt
|j|j|||j}
t||
dS  d u r
t|dd | D S |   dks J t| fddt| D S d S )	Nzexpected integer dim argumentr   r4   zexpected squeezed size to be 1r  c                 S  s   g | ]}|d kr|qS r  rx   r   rx   rx   ry   r   c  r   z&SqueezeView.create.<locals>.<listcomp>c                   s   g | ]\}}| kr|qS rx   rx   r   r   r   r1  rx   ry   r   f  r   )r   r  ru   rv   r   r   r   r   r   r[  r  r   r   r   r!  r  r  r   )r  rr   r  r"  r#  r!  r$  r   r   r   r%  rx   r1  ry   r  B  s8    


zSqueezeView.creater  r  c                   sF   dd | D }dd t | D t|  ddd fdd}||fS )	Nc                 S  s   g | ]}|d kr|qS r  rx   r   rx   rx   ry   r   j  r   z(SqueezeView.squeezer.<locals>.<listcomp>c                 S  s   g | ]\}}|d kr|qS r  rx   r2  rx   rx   ry   r   k  r   zlist[sympy.Expr]ztuple[sympy.Expr, ...]r   c                   sT   t | t ks"J |  d tjjg  }t| D ]\}}|||< q:t|S )N )r   r   r  r  r   r   )r   r  r   r   lengthZnot_onerx   ry   r   n  s
    "
z%SqueezeView.squeezer.<locals>.reindex)r   r   )r   r!  r   rx   r4  ry   squeezerh  s
    zSqueezeView.squeezerr   rD  c                 C  s   t dd S )Nzuse SqueezeView.create())AssertionError)rB  r  rx   rx   ry   r  w  s    zSqueezeView.__init__)r   r   r   r  r  r  r6  r  rx   rx   rx   ry   r0  @  s
   %r0  c                   @  s`   e Zd ZU ded< ded< dd Zddd	d
ZddddZeZedd Z	ddddZ
dS )GenericViewr0  r   r  r   c                 C  s   | j S rt   )r   rF  rx   rx   ry   r    s    zGenericView.make_reindexerr   rD  c                 C  sB   dd t t| jD }t| |}ddtt| d| S )Nc                 S  s   g | ]}t tj|qS rx   )rY   r3   r  )r   r  rx   rx   ry   r     s   z+GenericView.reindex_str.<locals>.<listcomp>zlambda , r   )r   r   r   r   r   r^  r\  r   )rB  Z	index_oldZ	index_newrx   rx   ry   reindex_str  s
    zGenericView.reindex_strc                 C  s$   |  | jd| j d|   gS )Nsize=zreindex=)r`  r  r   r:  rF  rx   rx   ry   r    s    zGenericView.__str__c                 C  s   | |t ||dS )Nr  r   r   )r   )r  rr   r!  r   rx   rx   ry   r    s    zGenericView.createrl  c                 C  s   | j S rt   r  rF  rx   rx   ry   r     s    zGenericView.get_sizeN)r   r   r   r   r  r:  r  r  r  r  r   rx   rx   rx   ry   r8  {  s   

r8  c                   @  s^   e Zd Zedd Zedd Zedd Zedddd	d
dddZedd	dddZ	dS )r  c                 C  s<   t | } t |}tjjjj}|t | dr8| | } | S rq  )r   r|  r_   r   r   r   r   Lt)r   r   r   rx   rx   ry   handle_negative_index  s    

zView.handle_negative_indexc           	        s   t |ttfsJ | | |\ }tjj |r:|S d}t	t
 dks^t	t
|dkrbd}d|v r fdd}| |t||dS t|s|r|rt|st|}t|dd\}}t|j|j|t||j}t||dS |  |}| |t||dS )	NFr   Tc                   s   t dgt  S rq  )r   r   r   r"  rx   ry   fake_reindex  s    z!View.create.<locals>.fake_reindexr<  )r   r  )ru   r   r   resolve_negative_sizer   r_   r   r   Zstatically_known_list_equalsr   r'   r  ExternKernelrequire_contiguousr  r  r   r   r   r   r   r!  r  )	r  rr   r!  Zunbacked_symbols_in_sizesr@  r"  r#  r%  r   rx   r?  ry   r    s6    
zView.createc                 C  s   dd |D }dd | D } t |}tt|D ]6}|| dkr0tjj||< tt| t|||<  qhq0tj	j
t| t| | |fS )Nc                 S  s   g | ]}t jj|qS rx   r_   r   r   r  ru  rx   rx   ry   r     r   z.View.resolve_negative_size.<locals>.<listcomp>c                 S  s   g | ]}t jj|qS rx   rD  ru  rx   rx   ry   r     r   r  )r   r   r   r   r  Oner0   rZ   r_   r   r   guard_equals)r"  r!  r   rx   rx   ry   rA    s    zView.resolve_negative_sizeNr  r  r   )r"  r!  	dense_dimrs   c              	   C  sZ   z|  |||}W nB ttfyT   t|g}|  ||}|  ||}t||}Y n0 |S rt   )_dynamic_reshape_indexerr7  
IndexErrorrZ   r   )r  r"  r!  rG  r   Zflatr   r   rx   rx   ry   r    s    
zView.dynamic_reshape_indexer)rG  c                   s  t jjj}dd tt|D  tt |}t| }|duoX|t|d koXt|dk}|r~|dusjJ ||}|	| g |r|r| }| \}	}
|dkrȈ	t
jj |	|	|
f q|
dkr|	| q||
||kr
	|	 t jj|
| q||
||k rn||
||k rR| \}}||
 |	 }	|
| }
q	|	 t jj|
| q||
||krt
jj}|}	t|	|| || }||
||kr| }	t|	|| || }|| }qt jj|
| qtq|r.| }t jj|d 	t
jj q|rT| \}	}
t jj|
d q.|durt|dkr   }|| n  tt| ksJ  fdd}|S )zG
        Perform a reshape entirely by modifying indexing math
        c                 S  s   g | ]}t tj|qS rx   )rY   r3   ZVIEWr   rx   rx   ry   r     s   z1View._dynamic_reshape_indexer.<locals>.<listcomp>Nr4   c                   sH   t | t ks$J t | t ftt|  t fddD S )Nc                 3  s   | ]}t | V  qd S rt   )r[   ru  replacementsrx   ry   r  B  r   zAView._dynamic_reshape_indexer.<locals>.reindex.<locals>.<genexpr>)r   r   r   r   r   r  Z	view_exprrJ  ry   r   ?  s    $z.View._dynamic_reshape_indexer.<locals>.reindex)r_   r   r   r   r   r   r   r   r  r[  r   r  r  rF  rE  r2   r7  reverseinsert)r"  r!  rG  r   Z	stack_newZ	stack_oldZreordering_dense_dimZold_dimZsize_oldvarZsize_newZvar2Z	size_new2ZdivisormodulusZ
dense_exprr   rx   rL  ry   rH    sz    






zView._dynamic_reshape_indexer)N)N)
r   r   r   r  r>  r  r  rA  r  rH  rx   rx   rx   ry   r    s   

,
 r  c                      s   e Zd ZU dZded< dd fddZddd	d
ZeZddddZddddZ	ddddZ
edd ZddddZdd ZddddZddddZddd d!Zd"d# Zd2d%d&d'd(d)Zd3d+dd,d-d.Zd/dd0d1Z  ZS )4r!  z*Pretend our storage has a different layoutre  r  r   rD  c                   s.   t    t| jtr*t| d| j  d S )Nr  )r  rG  ru   r  r  ro   rA  r  rF  r  rx   ry   rG  M  s    
zReinterpretView.__post_init__r   c                 C  s   |  | j| jgS rt   )r`  r  r  rF  rx   rx   ry   r  R  s
    zReinterpretView.__str__c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  \  s    zReinterpretView.get_namer{  c                 C  s   | j jS rt   )r  r   rF  rx   rx   ry   r   _  s    zReinterpretView.get_devicer<  c                 C  s   d S rt   rx   rF  rx   rx   ry   rP  b  s    zReinterpretView.get_origin_nodec                 C  s   | j jS rt   )r  r   rF  rx   rx   ry   r   e  s    zReinterpretView.dtyperl  c                 C  s   t | jjS rt   )r   r  r   rF  rx   rx   ry   r   i  s    zReinterpretView.get_sizec                 C  s   t | jjS rt   )r   r  r   rF  rx   rx   ry   r  l  s    zReinterpretView.get_strider  c                   s   ddd fdd}|S )Nrl  r^   r   c                   sJ    j  }t  || } j j jjkrBt| j jjS |S d S rt   )r  r  r]   loadr  r   r  to_dtype_bitcast)r   r  Z
tmp_loaderrF  rx   ry   r
  p  s
    
z+ReinterpretView.make_loader.<locals>.loaderrx   r  rx   rF  ry   r  o  s    zReinterpretView.make_loaderr  c                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  z  s    zReinterpretView.make_indexerc                 C  s   | j S rt   r  rF  rx   rx   ry   r   }  s    zReinterpretView.get_layoutc                 C  s   d S rt   rx   rF  rx   rx   ry   r    s    zReinterpretView.freeze_layoutFrp   r}   r  c                 C  s*   t | jj|t | jj|B t | jj|B S rt   )r   r  r   r   r   r  rx   rx   ry   r    s    z$ReinterpretView.get_free_symbol_usesNrv  rw  c                 C  s@   t jjj| j| jj| jj| jj|d ur,|j	nt jjj	| jj
dS r   )r_   r   wrapper_codeZcodegen_reinterpret_viewr  r  r   r   r   	writeliner   ry  rx   rx   ry   rz    s    z!ReinterpretView.codegen_referencerv   c                 C  s   dS rN  rx   rF  rx   rx   ry   r    s    zReinterpretView.num_reads)F)N)r   r   r   __doc__r   rG  r  r  r  r   rP  r  r   r   r  r  r  r   r  r  rz  r  r  rx   rx   r  ry   r!  G  s(   

 	r!  c                   @  s`   e Zd ZU dZded< edd Zdddd	ZeZe	d
d Z
ddddZddddZdS )	DtypeViewz(Pretend our storage has a different typera  target_dtypec                 C  sD   t |r8t|\}}t|j||j|j|j}t||dS t||dS )Nr  )r  rX  )	r   r  r  r   r   r   r   r!  rW  )r  rr   Z	new_dtyper"  r#  r%  rx   rx   ry   r    s    zDtypeView.creater   rD  c                 C  s   |  | j| jgS rt   )r`  r  rX  rF  rx   rx   ry   r    s    zDtypeView.__str__c                 C  s   | j S rt   )rX  rF  rx   rx   ry   r     s    zDtypeView.dtyperl  c                 C  s
   | j  S rt   r  r   rF  rx   rx   ry   r     s    zDtypeView.get_sizer  c                   s   j    fdd}|S )Nc                   s   t  | jjjS rt   )r]   rR  rX  r  r   r  r  rB  rx   ry   r
    s    z%DtypeView.make_loader.<locals>.loaderr  r  r  rx   rZ  ry   r    s    
zDtypeView.make_loaderN)r   r   r   rV  r   r  r  r  r  r  r   r   r  rx   rx   rx   ry   rW    s   


rW  c                   @  s&   e Zd Zedd ZedddZdS )		SliceViewc                   s   t jj| | tdd ||fD r:tjtjnjj	fdd  fdd}||dd}|||}||fS )zz
        Normalize start and end such that both are in the range
        [0, x.get_size()[dim]] and start <= end.
        c                 s  s   | ]}t |V  qd S rt   )r'   ru  rx   rx   ry   r    r   z0SliceView.normalize_start_end.<locals>.<genexpr>c                   s8    | |r| n | |}||r*|n||}|S rt   )statically_known_geqr  )rr   lowerupperZclamped_lowerZclamped_full)max_funcmin_funcr   rx   ry   clamp  s    
z,SliceView.normalize_start_end.<locals>.clampc                   s$   | d u r|S  | }  | ||S rt   )r>  )r  r^  r_  r  )rb  r  dim_sizerx   ry   
clamp_wrap  s    z1SliceView.normalize_start_end.<locals>.clamp_wrapr   )
r_   r   r   r   r  r   ZMinZMaxZevaluate_minZevaluate_max)r  rr   r  startendrd  rx   )rb  r  rc  r`  ra  r   ry   normalize_start_end  s    zSliceView.normalize_start_endr4   Tc                   s  t tt js"dks"J z"dkrB|dkrBdkrB|W S W n tyV   Y n0 t| |r|| | |\}t| d   < t	|rt
|\}}t|j}	|	   |	 < t|j|j|	|j|j    }
t||
dS  fdd}t||dS )Nr   l    r4   r  c                   sD   t | t ks$J d|  d t| } |     |  < | S )Nzwrong ndim r3  )r   r   r   r  r!  re  steprx   ry   r     s    $z!SliceView.create.<locals>.reindexr<  )r   r|  ru   r   	TypeErrorr   r   rg  r1   r   r  r   r  r   r   r   r!  r\  )r  rr   r  re  rf  ri  rb  r"  r#  r$  r%  r   rx   rh  ry   r    s2    


zSliceView.createN)r4   T)r   r   r   r  rg  r  rx   rx   rx   ry   r\    s   
$r\  c                   @  sV   e Zd ZU ded< ded< ddddZd	dd
dZddddZddddZdS )BaseConstantra  r   r|  r   rl  rD  c                 C  s   dS Nrx   rx   rF  rx   rx   ry   r     s    zBaseConstant.get_sizer{  c                 C  s   | j S rt   r  rF  rx   rx   ry   r      s    zBaseConstant.get_devicer<  c                 C  s   d S rt   rx   rF  rx   rx   ry   rP  #  s    zBaseConstant.get_origin_noder  c                 C  s   t  S rt   r.   rF  rx   rx   ry   rL  &  s    zBaseConstant.get_readsN)r   r   r   r   r   r   rP  rL  rx   rx   rx   ry   rk    s   
rk  c                   @  sR   e Zd ZU ded< ded< ded< ddd	d
ZddddZdddddZdS )Constantr
   r   ra  r   r|  r   r  rD  c                   s   ddd fdd}|S )Nrl  r^   r   c                   s   t  j jS rt   )r]   r  r   r   r   rF  rx   ry   r
  1  s    z$Constant.make_loader.<locals>.loaderrx   r  rx   rF  ry   r  0  s    zConstant.make_loaderr   c                 C  s   d S rt   rx   rF  rx   rx   ry   ru  6  s    zConstant.realizerm   r  c                 C  s   t | j| j|dS )N)r   r   r   )rm  r   r   r~  rx   rx   ry   r  9  s    zConstant.constant_to_deviceN)r   r   r   r   r  ru  r  rx   rx   rx   ry   rm  *  s   
rm  c                   @  sD   e Zd ZU ded< ded< ded< ddd	d
ZdddddZdS )IndexingConstantr
   r   ra  r   r|  r   r  rD  c                   s   ddd fdd}|S )Nrl  r^   r   c                   s   t  j jS rt   )r]   r}  r   r   r   rF  rx   ry   r
  D  s    z,IndexingConstant.make_loader.<locals>.loaderrx   r  rx   rF  ry   r  C  s    zIndexingConstant.make_loaderrm   r  c                 C  s   t | j| j|dS )N)r   r   r   )rn  r   r   r~  rx   rx   ry   r  I  s    z#IndexingConstant.constant_to_deviceN)r   r   r   r   r  r  rx   rx   rx   ry   rn  =  s
   
rn  )r   r  rs   c                 C  s    t dd t| t||D S )Nc                 s  s$   | ]\}}}|d kp||kV  qdS rc  rx   )r   leftrightr   rx   rx   ry   r  P  s   z2is_contiguous_strides_for_shape.<locals>.<genexpr>)r
  r   r   r   )r   r  rx   rx   ry   is_contiguous_strides_for_shapeM  s
    rq  )r   rs   c                 C  s   t j| j S rt   )r5   Zpadding_alignment_bytesitemsizerb  rx   rx   ry   get_align_for_dtypeX  s    rs  c                   @  s,   e Zd ZdZddddZddddZd	S )
r   zxAbstract base for Layout, MultiOutputLayout, NoneLayout.
    Represents the memory layout of the output of an Operation.r{  rD  c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   r   `  s    zOutputSpec.get_devicerv   c                 C  s   t t| jd S rt   r  rF  rx   rx   ry   storage_sizec  s    zOutputSpec.storage_sizeN)r   r   r   rV  r   rt  rx   rx   rx   ry   r   \  s   r   c                   @  s   e Zd Zdedfddddddd	d
dZddddZeZddddZddddZddddZ	e
ddddddZddddZddddZd d! Ze
d"d# Zd$d% Zd&d' Zd(d) Zd*dd+d,Zddd-d.Zd/dd0d1ZdS )2re  Nr   r|  ra  r0  zOptional[list[Expr]]r   r   )r   r   r   r   r   rs   c                 C  sn   |d u rt |}|| _|| _t|t|ksBJ d| d| tdd |D sXJ || _|| _|| _d S )Nr;  	, stride=c                 s  s   | ]}t |ttfV  qd S rt   )ru   r   rv   r   rx   rx   ry   r  v  r   z"Layout.__init__.<locals>.<genexpr>)	r   r   r   r   r   r
  r   r   r   )rB  r   r   r   r   r   rx   rx   ry   r  i  s    
$zLayout.__init__r   rD  c                 C  sr   d}| j dkrd| j  }| jjd u r*dnd| jj }t| j d| jj | d| j d| j d| j | d	S )
NrU  r   z	, offset=:z('z', z, size=ru  r   )r   r   r   r   r   r   r   r   )rB  r   Zdevice_index_strrx   rx   ry   r  {  s    
"zLayout.__str__c                 C  s   | j S rt   r  rF  rx   rx   ry   r     s    zLayout.get_devicer   c                 C  sL   t j2 tjt| jt| j| j| jdW  d    S 1 s>0    Y  d S )Nr   r   )	r_   	fake_moder   r   rM   r   r   r   r   rF  rx   rx   ry   get_example  s    zLayout.get_examplerp   c                 C  s   t | j| jS rt   )rq  r   r   rF  rx   rx   ry   r    s    zLayout.is_contiguousr  )r  r  rs   c                 C  sV   t | }|dvs| d dkr dS t|t| | D ] \}}}|dkr0||kr0 dS q0dS )N)r      r4   FT)r   r   r!   )r  r  ndimro  rp  r   rx   rx   ry   is_channels_last_contiguous  s    
z"Layout.is_channels_last_contiguousc                 C  sJ   t | jtttt| j| jD ] \}}}|dkr$||kr$ dS q$dS )Nr4   FT)r   r   reversedr   r   r   r   )rB  ro  rp  r   rx   rx   ry   is_transposed  s    zLayout.is_transposedc                   s   t jt  ksJ dd tjD }fdd|D } fdd|D  dd }|  dgt   }tt  D ]}|| | | < qxtt  d D ]N}|| ||d  k}t|tstjj	j
|| ||d  kd	d
}|r dS qd	S )Nc                 S  s*   g | ]"\}}t jjj|d ddkr|qS )r   r  r4   )r_   r   r   r   )r   r   r  rx   rx   ry   r     s   z,Layout.is_stride_ordered.<locals>.<listcomp>c                   s   g | ]} j | qS rx   r+  r   rF  rx   ry   r     r   c                   s   g | ]} | qS rx   rx   r   r   rx   ry   r     r   c                   s   t |   fdd| D S )Nc                   s   g | ]}  |qS rx   r   )r   elementZ
sorted_arrrx   ry   r     r   zDLayout.is_stride_ordered.<locals>.sorted_indices.<locals>.<listcomp>)rZ  )Zarrrx   r  ry   sorted_indices  s    z0Layout.is_stride_ordered.<locals>.sorted_indicesr  r4   Tr  F)r   r   r   r   r   ru   rp   r_   r   Z
_shape_envr   )rB  r   Znon_1_indicesr   r  stride_orderedr   exprrx   )r   rB  ry   r    s(    
zLayout.is_stride_orderedc                 C  s:   dgt ttdt| jd  }t|g| }| |S Nr   r4   )r   r}  r   r   r   r  r  rx   rx   ry   is_channels_last_stride_ordered  s    "z&Layout.is_channels_last_stride_orderedc                 C  s0  t |}t| dkr| S tjs.t|| r.| S t }t|drR|j	
ddrR| S tdd t| |D sp| S t| }t|}dd tt| D }d	||d < d}t|d	d
 d	dD ]X\}	}
||	d	  }|| ||  }|tjkr
|| dkr
t||| }d}|||
< q|s| S t jd	7  _|S )z
        The padding does not change stride order but makes sure all strides larger
        than the threshold are multiple of align.
        r   r,  Zdislike_paddingFc                 s  s   | ]}t |ttjfV  qd S rt   )ru   rv   r   r   r   rx   rx   ry   r    s   z&Layout._pad_strides.<locals>.<genexpr>c                 S  s   g | ]}d qS r   rx   r   rx   rx   ry   r     r   z'Layout._pad_strides.<locals>.<listcomp>r4   N)re  T)rs  r   r5   Zpad_channels_lastre  r|  r_   Zget_current_noder  r,  getr
  rx  chainr   r   r   r   Zpadding_stride_thresholdrK   r   Znum_comprehensive_padding)Z
in_stridesr   r   ZalignZcurrent_fx_noder  r   Znew_stridespaddedrankr   Zprev_idxr   rx   rx   ry   _pad_strides  s@    


zLayout._pad_stridesc                 C  s6   t | tsJ | jd usJ | | j| j| j| _d S rt   )ru   r   r   r  r   r   rF  rx   rx   ry   r    s    zLayout.pad_stridesc                 C  s   t jot| tS rt   )r5   comprehensive_paddingru   r   rF  rx   rx   ry   r    s    zLayout.should_pad_stridesc                 C  s8   t | tr| S |  r|   t| j| j| j| j| jS rt   )	ru   r  r  r  r   r   r   r   r   rF  rx   rx   ry   as_fixed  s    
zLayout.as_fixedr  c                 C  s(   t jsJ dt| j d|   S )Nzconvert z to FixedLayout first)r   r  r   r   r  r  rF  rx   rx   ry   r  )  s    zLayout.make_indexerc                 C  s<   | j |j ko:| j|jko:| j|jko:| j|jko:| j|jkS rt   r   r   r   r   r   )rB  otherrx   rx   ry   __eq__/  s    



zLayout.__eq__
sympy.Exprc                 C  s   t | j| j| jS rt   )r   r   r   r   rF  rx   rx   ry   rt  8  s    zLayout.storage_size)r   r   r   r   r  r  r  r   ry  r  r  r|  r~  r  r  r  r  r  r  r  r  rt  rx   rx   rx   ry   re  g  s*   	
#
:	re  c                   @  s   e Zd ZdZddddZdS )r  z A Tensor layout we cannot changer  rD  c                   s    fdd}|S )z1A closure containing math to read a given elementc                   sf   t | t  jksJ t | t  jks,J  j}t|  j jD ]\}}}|dkrB|||  }qB|S rN  )r   r   r   r   r   )r   r  r   r   szrF  rx   ry   r  B  s    z)FixedLayout.make_indexer.<locals>.indexerrx   r  rx   rF  ry   r  ?  s    	zFixedLayout.make_indexerN)r   r   r   rV  r  rx   rx   rx   ry   r  <  s   r  c                      s   e Zd ZdZdZedd Zedd Zedd Zed	d
 Z	edd Z
dddZdddZdd Zdd Zddd fddZ  ZS )r   z(A Tensor layout we are allowed to changeFc                 C  sN   t | dkrg S tjjg}t| dd  D ]}|||d   q*tt|S )Nr   r4   r  )r   r   r  rE  r}  r[  r   )sizesZreversed_stridesr   rx   rx   ry   r   T  s    
z!FlexibleLayout.contiguous_stridesc                 C  s\   t tt| t |ks$J | |ftjj}dgt| }|D ]}|||< || |  }q>|S )z
        Create a stride based on the order the dimensions should be filled in.

        In this format, channels last would be:
            [1, 3, 2, 0]
        N)r/   r   r   r   r  rE  )r  r   Znext_strider  r   rx   rx   ry   fill_ordered]  s    $zFlexibleLayout.fill_orderedc                 C  s0   t tt| t |ksJ t|}t| |S )z
        Create a stride based on the sorted order of a permuted range.

        In this format, channels last would be:
            [3, 0, 2, 1]
        )r/   r   r   r   r   r  )r  r   r   rx   rx   ry   r  n  s    zFlexibleLayout.stride_orderedc                 C  sT   |t jkrt| tS |t jkr,t| tS |t jkr@t| S t	
d| tdS )aq  
        Create a stride based on a memory format.

        Memory format is translasted into a stride order,
        so channels_last is the same as:
            FlexibleLayout.stride_ordered(sizes, [3, 0, 2, 1])

        This interface does not support memory_format `torch.preserve_format`
        which should be used to deduce a format from another source
        z>stride_ordered_for_memory_format, unsuppored memory_format: %sN)r   channels_lastr   r  NHWC_STRIDE_ORDERchannels_last_3dNHWDC_STRIDE_ORDERZcontiguous_formatr   rg  rh  rc  )r  memory_formatrx   rx   ry    stride_ordered_for_memory_formatz  s    



z/FlexibleLayout.stride_ordered_for_memory_formatc                 C  sD   t | t |ksJ dd |D }ttt ||jd}t| |S )z
        Create a stride that has the same stride order as given stride

        For example, if given stride is [1000, 1, 100, 10],
        the fill order should be [1, 3, 2, 0]
        c                 S  s   g | ]}t jj|qS rx   )r_   r   r   r  ru  rx   rx   ry   r     r   z/FlexibleLayout.same_ordered.<locals>.<listcomp>rU  )r   rZ  r   __getitem__r   r  )r  r   r   rx   rx   ry   same_ordered  s    zFlexibleLayout.same_orderedc                 C  sD   |  | j|}|  r,|r,| || j| j}t| j| j| j|| jS rt   )r  r   r  r  r   r  r   r   )rB  r   r  r$  rx   rx   ry   as_stride_order  s    zFlexibleLayout.as_stride_orderc                 C  s:   |}|   r"|r"| || j| j}t| j| j| j|| jS rt   )r  r  r   r   r  r   r   )rB  r  r  r$  rx   rx   ry   as_exact_strides  s    zFlexibleLayout.as_exact_stridesc                 C  s@   |  | j|}|  r(| || j| j}t| j| j| j|| jS rt   )r  r   r  r  r   r  r   r   )rB  r   r$  rx   rx   ry   as_fill_order  s    zFlexibleLayout.as_fill_orderc                 C  s@   |  | j|}|  r(| || j| j}t| j| j| j|| jS rt   )r  r   r  r  r   r  r   r   )rB  r   r$  rx   rx   ry   as_same_order  s    zFlexibleLayout.as_same_orderNr   rD  c                   s2   |rt ||}n
t |}t |||| d S rt   )r   r  r   r  r  )rB  r   r   r   r  r  r  rx   ry   r    s    
zFlexibleLayout.__init__)F)F)N)r   r   r   rV  r  r  r   r  r  r  r  r  r  r  r  r  r  rx   rx   r  ry   r   N  s"   






r   c                      s>   e Zd ZdZddd fddZddd	d
Zdd Z  ZS )NonOwningLayoutz,Is a view into the storage of another tensorzUnion[BaseView, TensorBox]r   )viewrs   c                   s,   |  }t |j|j|j|j || _d S rt   )r   r  r  r   r   r   r   r  )rB  r  r  r  rx   ry   r    s    zNonOwningLayout.__init__r  rD  c                 C  s   |    S rt   )r  r  rF  rx   rx   ry   r    s    zNonOwningLayout.make_indexerc                 C  s4   | j  j}|dkrdS ddlm} tjj||S )Nr   Tr4   )	ALIGNMENT)	r  r   r   utilsr  r_   r   r   Zstatically_known_multiple_of)rB  r   r  rx   rx   ry   maybe_guard_aligned  s
    z#NonOwningLayout.maybe_guard_aligned)r   r   r   rV  r  r  r  r  rx   rx   r  ry   r    s   
r  c                   @  s   e Zd ZdZdS )CommBufferTypeZsymm_memN)r   r   r   ZSYMM_MEMrx   rx   rx   ry   r    s   r  c                      s<   e Zd ZU dZded< ded< dddd fdd	Z  ZS )
CommBufferLayoutax  
    A layout that signifies the buffer is a comm buffer.
    In terms of striding, the layout is identical to `FixedLayout`.

    Buffers with this layout do not participate in in-place reuse - it can be
    neither the source nor the target for in-place reuse.

    For detailed motivation and usage of this layout, see
    NOTE [lowering-time collective optimization].
    r  comm_buffer_typer   
group_namer   )r  r  r  c                   sR   t |tstd| d| }t j|j|j|j|j	|j
d || _|| _d S )NzJA `CommBufferLayout` can only be initialized with a `FlexibleLayout` (got z).r  )ru   r   r7  r  r  r  r   r   r   r   r   r  r  )rB  r  r  r  fixedr  rx   ry   r    s     
zCommBufferLayout.__init__)r   r   r   rV  r   r  r  rx   rx   r  ry   r    s   
r  c                   @  sj   e Zd ZU ded< ejdd dZded< ejdd dZded	< d
dddZdd Z	ddddZ
dS )
NoneLayoutr{  r   c                   C  s   dgS rq  rx   rx   rx   rx   ry   rT  )  r   zNoneLayout.<lambda>default_factoryr  r   c                   C  s   dgS rq  rx   rx   rx   rx   ry   rT  *  r   r   rv   rD  c                 C  s   dS rq  rx   rF  rx   rx   ry   rt  ,  s    zNoneLayout.storage_sizec                 C  s   | S rt   rx   rF  rx   rx   ry   r  /  s    zNoneLayout.as_fixedc                 C  s   | j S rt   r  rF  rx   rx   ry   r   2  s    zNoneLayout.get_deviceN)r   r   r   r   r  r  r   r   rt  r  r   rx   rx   rx   ry   r    s   

r  c                      s   e Zd Zddd fddZedddd	Zejd
dddd	ZddddZddddZdd Z	e
dddZdd ZddddZ  ZS )MutationLayoutSHOULDREMOVErm   r   )r'  rs   c                   s@   t  | | | d  || _|   }tj	
| d S rt   )r  r  r  r   r   r'  
get_bufferr  r_   r   mark_buffer_mutated)rB  r'  r   r  rx   ry   r  7  s    z#MutationLayoutSHOULDREMOVE.__init__r0  rD  c                 C  s
   |   jS rt   )real_layoutr   rF  rx   rx   ry   r   B  s    z!MutationLayoutSHOULDREMOVE.strider   r   c                 C  s   d S rt   rx   )rB  r   rx   rx   ry   r   F  s    r  c                 C  s   |    S rt   )r  rt  rF  rx   rx   ry   rt  J  s    z'MutationLayoutSHOULDREMOVE.storage_sizer  c                   s,    fdd  | j }t|ts(J d|S )Nc                   sB   t | tr | jS t | tr* |  S t | tr> | jS | S rt   )ru   r  r'  r  r  
MutableBoxr  )r'  unwrap_viewsrx   ry   r  N  s    




z;MutationLayoutSHOULDREMOVE.get_buffer.<locals>.unwrap_viewsz1MutationLayoutSHOULDREMOVE must refer to a buffer)r'  ru   r  )rB  r  rx   r  ry   r  M  s    	
z%MutationLayoutSHOULDREMOVE.get_bufferc                 C  s
   |   jS rt   )r  r  rF  rx   rx   ry   r  ]  s    z&MutationLayoutSHOULDREMOVE.real_layoutFc              	   C  s   |   tj|  t|tr(|j}|  |slt	j
| | | dd t| | D dj}|   t|jjtsJ t||j_|jS )Nc                 S  s    g | ]\}}t jj||qS rx   r_   r   r   rF  r   r'  r(  rx   rx   ry   r   x  s   z;MutationLayoutSHOULDREMOVE.realize_into.<locals>.<listcomp>r  )ru  r_   r   r  r  ru   rl   r  r  r  r  r   r   r  r   r   r  r   r  )r  srcdstZunsafe_aliasrx   rx   ry   realize_into`  s$    

z'MutationLayoutSHOULDREMOVE.realize_intoc                 C  s   | S rt   rx   rF  rx   rx   ry   r    s    z#MutationLayoutSHOULDREMOVE.as_fixedr  c                 C  s
   | j  S rt   )r'  r  rF  rx   rx   ry   r    s    z'MutationLayoutSHOULDREMOVE.make_indexer)F)r   r   r   r  r  r   setterrt  r  r  r  r  r  r  r  rx   rx   r  ry   r  6  s   "r  c                      s  e Zd ZU ded< ded< dd fddZd	dd
dZddddZddddZddddZddddZ	e
ddddZddddZdddd Zd!dd"d#Zd$dd%d&Zddd'd(Zd)d* Zd+d, ZdUddd.d/Zddd0d1Zddd2d3ZdVddd4d5Zd6d7 Zd8dd9d:ZdWd<dd=d>d?Zd@dA ZdBddCdDZdBddEdFZdGddHdIZdXdJdKdLdMdNZdKddOdPZdddQdRZ dJddSdTZ!  Z"S )Yr  r   r   r   r  r   rD  c                   s   t    | dd  d S r  )r  rG  rC  rF  r  rx   ry   rG    s    
zBuffer.__post_init__r  c                 C  s   |    S rt   )r   r  rF  rx   rx   ry   r    s    zBuffer.make_indexerr   c                 C  s   | j sJ | | j S rt   r   rF  rx   rx   ry   r    s    zBuffer.get_namez!Union[torch.Tensor, sympy.Symbol]c                 C  s*   t | jtr| j S tt| jjd S rt   )ru   r  re  ry  rc  r   r   rF  rx   rx   ry   ry    s    
zBuffer.get_exampler{  c                 C  s   |    S rt   )ri  r   rF  rx   rx   ry   r     s    zBuffer.get_devicerQ  c                 C  s   d S rt   rx   rF  rx   rx   ry   rR    s    zBuffer.get_defining_opra  c                 C  s
   |   jS rt   )r   r   rF  rx   rx   ry   r     s    zBuffer.dtyperl  c                 C  s   g |   jS rt   )r   r   rF  rx   rx   ry   r     s    zBuffer.get_sizer0  c                 C  s   g |   jS rt   )r   r   rF  rx   rx   ry   r    s    zBuffer.get_strider   c                 C  s
   |   jS rt   )r   r   rF  rx   rx   ry   
get_offset  s    zBuffer.get_offsetre  c                 C  s&   t | jtr| jS tt| jjd S rt   )ru   r  re  rc  r   r   rF  rx   rx   ry   r     s    zBuffer.get_layoutc                 C  s   | j S rt   rS  rF  rx   rx   ry   ri    s    zBuffer.get_output_specc                 C  s   |   S rt   )rp  rF  rx   rx   ry   r    s    zBuffer.get_storage_numelc                 C  s(   t | jtr$t | jts$| j | _d S rt   )ru   r  re  r  r  rF  rx   rx   ry   r    s    zBuffer.freeze_layoutFc                 C  s&   t | jtsJ | jj||d| _d S Nr  )ru   r  r   r  r  rx   rx   ry   r    s    z&Buffer.freeze_layout_with_stride_orderc                 C  s"   t | jtsJ | j|| _d S rt   )ru   r  r   r  r  rx   rx   ry   r    s    z$Buffer.freeze_layout_with_fill_orderc                 C  s"   t | jtsJ | j|| _d S rt   )ru   r  r   r  r  rx   rx   ry   r    s    z$Buffer.freeze_layout_with_same_orderc                 C  s&   t | jtsJ | jj||d| _d S r  )ru   r  r   r  r  rx   rx   ry   r    s    z'Buffer.freeze_layout_with_exact_stridesc                 C  s   t jjt|  dS rq  rr  rF  rx   rx   ry   rt    s    zBuffer.is_zero_elementsr  c                   s(      rtt  dS  fdd}|S )Nrb  c                   s      }t jpd|| S r  )r  r]   rQ  r   r   r  rF  rx   ry   r
    s    z"Buffer.make_loader.<locals>.loader)rt  r	   r  r   r  rx   rF  ry   r    s    zBuffer.make_loaderNrv  rw  c                 C  s   |   S rt   r  ry  rx   rx   ry   rz    s    zBuffer.codegen_referencec                 C  s   d S rt   rx   rF  rx   rx   ry   r\    s    zBuffer.decide_layoutrS  c                 C  s   t | jtr| jj gS dS rl  )ru   r  r  r  r  rF  rx   rx   ry   r    s    z#Buffer.get_inputs_that_alias_outputc                 C  s   t | jtr| jj gS dS rl  )ru   r  r  r'  r  rF  rx   rx   ry   r    s    zBuffer.get_mutation_namesrH  c                 C  s   t |  gS rt   )r/   r  rF  rx   rx   ry   rM    s    zBuffer.get_read_namesrp   r}   r  c                 C  s   t  S rt   r.   r  rx   rx   ry   r    s    zBuffer.get_free_symbol_usesc                 C  s   t  S rt   r.   rF  rx   rx   ry   r    s    zBuffer.get_unbacked_symbol_defsc                 C  s   d S rt   rx   rF  rx   rx   ry   ru    s    zBuffer.realizec                 C  s   dS r  rx   rF  rx   rx   ry   should_allocate  s    zBuffer.should_allocate)F)F)N)F)#r   r   r   r   rG  r  r  ry  r   rR  r  r   r   r  r  r   ri  r  r  r  r  r  r  rt  r  rz  r\  r  r  rM  r  r  ru  r  r  rx   rx   r  ry   r    sD   
  r  c                   @  s<   e Zd ZddddZddddZejZddd	d
ZdS )OperationBufferr/  rD  c                 C  s   | gS rt   rx   rF  rx   rx   ry   r    s    zOperationBuffer.get_outputsr  c                 C  s   | S rt   rx   rF  rx   rx   ry   rR    s    zOperationBuffer.get_defining_opr   c                 C  s   t |  t|  d S rt   )r  rG  r  rF  rx   rx   ry   rG    s    
zOperationBuffer.__post_init__N)r   r   r   r  rR  r  r  rG  rx   rx   rx   ry   r    s   r  c                   @  s   e Zd ZddddZdS )InputBufferrv   rD  c                 C  s   dS rN  rx   rF  rx   rx   ry   r    s    zInputBuffer.num_readsN)r   r   r   r  rx   rx   rx   ry   r    s   r  c                   @  s   e Zd ZdZdS )DonatedBufferaY  
    Represents a donated buffer which is a saved tensor that is not alias to any
    fwd inputs, fwd user outputs, and bwd outputs. We generally cannot inplace
    reuse the input tensor memory during backward since it might be used in another
    function. However, donated buffer can be inplace reused during backward
    to save memory.
    N)r   r   r   rV  rx   rx   rx   ry   r  #  s   r  c                   @  s8   e Zd ZU dZded< ddddZdd	d
ddZdS )r  Nr{  r  r  rD  c                   s   ddd fdd}|S )Nrl  r^   r   c                   s,       }ttj   j|| S rt   )	r   r  r]   rQ  r_   r   constant_namer  r  r  rF  rx   ry   r
  1  s
    z*ConstantBuffer.make_loader.<locals>.loaderrx   r  rx   rF  ry   r  0  s    zConstantBuffer.make_loaderr|  rm   r  c                 C  s   t tj|  || jdS N)r   r  )r  r_   r   r  r  r  r~  rx   rx   ry   r  :  s    z!ConstantBuffer.constant_to_device)r   r   r   r  r   r  r  rx   rx   rx   ry   r  -  s   

r  c                   @  sZ   e Zd ZddddZddddd	d
ZddddddZddddZddddZdS )NoneAsConstantBufferr  rD  c                 C  s   t  S rt   r.   rF  rx   rx   ry   rL  B  s    zNoneAsConstantBuffer.get_readsFrp   r}   r  c                 C  s   t  S rt   r.   r  rx   rx   ry   r  E  s    z)NoneAsConstantBuffer.get_free_symbol_usesNrv  r   rw  c                 C  s
   t jjjS rt   )r_   r   rT  none_strry  rx   rx   ry   rz  J  s    z&NoneAsConstantBuffer.codegen_referencer   c                 C  s
   t d dS Nr  )r  rF  rx   rx   ry   ri  M  s    z$NoneAsConstantBuffer.get_output_specc                 C  s   dS r  rx   rF  rx   rx   ry   rk  P  s    z&NoneAsConstantBuffer.has_tensor_output)F)N)r   r   r   rL  r  rz  ri  rk  rx   rx   rx   ry   r  @  s    r  c                   @  sH   e Zd ZU ded< ddddddZdd
ddddZddddZd	S )r   r   r  Frp   r}   r  c                 C  s   t | j|S rt   )r   r  r  rx   rx   ry   r  X  s    z*ShapeAsConstantBuffer.get_free_symbol_usesNrv  r   rw  c                 C  s   t jj| jS rt   )r_   r   rT  Zcodegen_sizevarr  ry  rx   rx   ry   rz  ]  s    z'ShapeAsConstantBuffer.codegen_referencerD  c                 C  s   dS r  rx   rF  rx   rx   ry   rk  `  s    z'ShapeAsConstantBuffer.has_tensor_output)F)N)r   r   r   r   r  rz  rk  rx   rx   rx   ry   r   T  s
   
 r   c                      s*  e Zd ZU ded< ddddZdddd	Zd
dddZddddZddddZd?dddddZ	dd fddZ
ddddZddd d!Zd"dd#d$Zed%dd&d'Zd@d)d*d+d,d-d.ZedAd/d0Zd1dd2d3Zddd4d5Zddd6d7Zddd8d9Zd:d;d<d=d>Z  ZS )BrX  r  r  r   rD  c                 C  s(   | j dur| j S t| jdr$| jj S dS )z
        Returns self.name if it exists, otherwise returns the name of the data node if that exists.
        If neither exist, returns None.
        Nr   )r   r  r  rF  rx   rx   ry   get_computed_buffer_nameh  s
    
z'ComputedBuffer.get_computed_buffer_namerv   c                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r  s  s    zComputedBuffer.num_readsr  c                 C  s
   | j  S rt   r  rL  rF  rx   rx   ry   rL  v  s    zComputedBuffer.get_readsrH  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   rM  y  s    zComputedBuffer.get_read_namesr  c                 C  s   t tddf | j rDt|  | j | j W  d    S t|  | j	 W  d    S W d    n1 sz0    Y  d S r  )
r   ro   r   r  r  r>   get_store_functionr  r  r   rF  rx   rx   ry   r  |  s    
zComputedBuffer.get_read_writesFrp   r}   r  c                 C  s6   t |  |t |  |B t |  |B | j|B S rt   )r   r   r  r  r  r  r  rx   rx   ry   r    s    
z#ComputedBuffer.get_free_symbol_usesr  c                   s6   |   s,| jtjjvr,|  dkr,| j S t  S rq  )	r  r   r_   r   Zmutated_buffersr  r  r  r  rF  r  rx   ry   r    s    

zComputedBuffer.make_loaderzCallable[..., None]c                 C  sZ   |     }t| jtttfr4t| jj	| j
|S t| jtsDJ t| jj| j
|S d S rt   )r   r  r  ru   r  r7  r  r  r	   r?  r   r  r  r  rx   rx   ry   r    s
    z!ComputedBuffer.get_store_functionzOptional[list[int]]c                   s   t | jtrt| j | j \\}}|  j	}t
dd |D sLJ fdd|D }|rt | jttfr| j| n|  fdd|D }ddlm} |||  S dS )	al  
        If our layout is still flexible, try to determine the stride order based on stride orders of reads.

        TODO(jansel): A better algorithm here would look at downstream consumers of this
                      value and try to do global graph-level layout optimization.
                      This is also something just begging to be autotuned.
        c                 s  s    | ]}t |tjtjfV  qd S rt   )ru   r6   StarDep	MemoryDeprS  rx   rx   ry   r    s   z0ComputedBuffer.get_fill_order.<locals>.<genexpr>c                   s.   g | ]&}t |tjrt|jd d  D qS )c                 S  s   i | ]}|d kr|t jjqS r   r  )r   vrx   rx   ry   r     r   z<ComputedBuffer.get_fill_order.<locals>.<listcomp>.<dictcomp>)ru   r6   r  r[   r   rS  )r=  rx   ry   r     s   z1ComputedBuffer.get_fill_order.<locals>.<listcomp>c                   s   g | ]}t jj| qS rx   r_   r   r   rk  r   r  )r_  rx   ry   r     s   r4   pick_loop_orderN)ru   r  r   r6   ri  r  r  r  r  r  r
  r  r  r   	schedulerr  r   )rB  
index_varsr   r  Zstride_lengthsr  rx   )r_  r=  ry   r     s*    


zComputedBuffer.get_fill_orderr   c                 C  s0   t | jtr,|  }|r$| | n|   d S rt   )ru   r  r   r   r  r  r  rx   rx   ry   r\    s
    zComputedBuffer.decide_layoutzetuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody, tuple[list[sympy.Expr], list[sympy.Expr]]]c           
      C  s   t j| j | j dd\}}ttd|  < t	| 
 |  rH|n
|d d |g|R  }W d    n1 st0    Y  g }g }g }g }| D ]V\}}	||d v r|rJ || ||	 q||d v sJ || ||	 q||f|||ffS )Nqrj   r  r4   r   )r6   ri  r  r  r  r   ro   r  r   r@   r  r  itemsr[  )
rB  r   
var_rangesr  r  reduce_vars
index_sizereduce_sizer  r   rx   rx   ry   get_default_sizes_body  s0    
&

z%ComputedBuffer.get_default_sizes_bodyN*Optional[tuple[dict[Any, Any], list[Any]]]Optional[Callable[..., Any]]z:tuple[tuple[list[sympy.Expr], list[sympy.Expr]], LoopBody])extra_indexing_constraintsrecompute_sizes_body_funcrs   c                   s    \\}}}\}}|r<|||f|||f\\}}}\}}g |j  |durt|trht|dkslJ |\}}	t|tsJ t|	tsJ tdd |	D sJ |j	}
|
|ksJ |
|f fdd|	D }	 |	7  g |
 tjtjs|   fdd}|| }tt p4tj }|||||\}}}|||||\}}}tj||d	d
\\}}}t|||||g|||}||f|fS )an  
        This is a main place where we do loop transformations in a
        backend-agnostic way.

        Here we:
            1) Remove any 1 dimensions
            2) Fuse contiguous dimensions together
            3) Reorder dimensions based on stride orders

        Optional argument extra_indexing_constraints can be used to append additional
        indexing expressions to existing ones derived from buffer's body. This can be useful
        to fuse scheduler nodes with compatible ranges, e.g. (s0*s1*...,) and (s0, s1, s2, ...)
        on CPU by preventing indexing simplifications and obtaining index/reduce ranges for
        the scheduler node compatible with other nodes.
        Optional argument recompute_sizes_body_func can be used to recompute sizes and body
        on the default body. This can be useful to append additional loop transformations.
        Nr   c                 s  s   | ]}t |tV  qd S rt   )ru   r   )r   frx   rx   ry   r  5  r   z6ComputedBuffer.simplify_and_reorder.<locals>.<genexpr>c                   s   g | ]}| vr|qS rx   rx   r  )index_formulasrx   ry   r   =  s   z7ComputedBuffer.simplify_and_reorder.<locals>.<listcomp>c           	        s\    | ||\}}}|| } |rNtjj| |t | |\}}}t||}n|}|||fS rt   )_apply_loop_reorderingr_   r   r   _simplify_loopsr:   r   )	Zx_varssupport_varsr  Zsimplify_loopsZreindex0r   r   _pruner   r  memory_addrsrB  rx   ry   simplify_and_reorderF  s    


zAComputedBuffer.simplify_and_reorder.<locals>.simplify_and_reorderprj   )r  Zindexing_exprsr   ru   r   r   r   r   r
  r  Zget_write_exprsr_   r   rd  r7   ZPREFER_STORE_LOOP_ORDERextendZget_read_exprsrV   r   r5   Zloop_ordering_after_fusionr6   Zindex_vars_no_squeezer@   )rB  r  r  r  r  r  r  r  Zextra_indexing_rangesZextra_indexing_exprZexpected_var_rangesr  r  Zshould_merge_loopsZiter_rangesZiter_reindexr   Zreduce_rangesZreduce_reindexZ	iter_varsr  rx   r  ry   r    sz    




z#ComputedBuffer.simplify_and_reorderc              
     s   ddl m} |du rg }zT fdd|D }t|t|krRt|d t ksVJ tt|||}W n@ ty   tjrt	dt
t | ttt}Y n0 fdd|D t|t|fS )	zU
        Shuffle the order of loops around to hopefully improve performance.
        r4   r  Nc                   s   g | ]}t jj| qS rx   r  r  )r  r  rx   ry   r     s   z9ComputedBuffer._apply_loop_reordering.<locals>.<listcomp>r   z%Did not simplify complex index:
%s
%sc                   s   g | ]} | qS rx   rx   r   )r  rx   ry   r     r   )r  r  r   r   r}  	Exceptionr5   rh  rg  warningr   r   r   r   r   )r  r  r  r  Zpriority_idxr  r  r   rx   )r  r  r  ry   r  z  s*    
z%ComputedBuffer._apply_loop_reorderingr  c                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r    s    z!ComputedBuffer.get_reduction_sizec                 C  s
   | j  S rt   r  r  rF  rx   rx   ry   r    s    z!ComputedBuffer.get_reduction_typec                 C  s
   | j  S rt   )r  rt  rF  rx   rx   ry   r    s    zComputedBuffer.is_no_opc                 C  s   dS NTrx   rF  rx   rx   ry   r    s    zComputedBuffer.should_allocater|  rm   r  c                 C  s   | j |S )r  r  r  r~  rx   rx   ry   r    s    z!ComputedBuffer.constant_to_device)F)NN)N)r   r   r   r   r  r  rL  rM  r  r  r  r  r   r\  rJ   r  r  r  r  r  r  r  r  r  r  rx   rx   r  ry   rX  d  s2   
 
'"  s #rX  c                      s~   e Zd ZdZddddd fddZd	d
ddZdd Zdd
ddZdd
ddZdd
ddZ	ddddddZ
  ZS )TemplateBufferzt
    Represents a Triton (in the future other type) of template operator
    that we can fuse an epilogue onto.
    re  Sequence[IRNode]r  r   )r  r2  make_kernel_renderrs   c                   s@   t  jd |d t|| _|| _tj| | _	tj
|  d S r  )r  r  InputsKernelunwrap_storager2  r  r_   r   register_bufferr   register_operation)rB  r  r2  r  r  rx   ry   r    s
    zTemplateBuffer.__init__r  rD  c                 C  s   | j ddS )NT	normalize)r>   rF  rx   rx   ry   r    s    zTemplateBuffer.get_read_writesc              	     s   |   |     fdd}tj||  d|d}| jD ]>j   fdd}| jtj| dddjO  _q>|S )Nc                   s"   t |dksJ t | dS )Nr   Zfake)r   r]   r  r  )r  r   rx   ry   dummy  s    z1TemplateBuffer.extract_read_writes.<locals>.dummyrx   r  c                   s(   t |dksJ t  |  d S rq  )r   r]   rQ  r  r  )r  r4  rx   ry   r    s    T)	r  r   r  r6   r>   r   r2  r  r  )rB  r  r  depsrx   )r  r4  r   ry   r>     s    


z"TemplateBuffer.extract_read_writesr  c                 C  s   t jjS rt   )r   r  rE  rF  rx   rx   ry   r    s    z!TemplateBuffer.get_reduction_sizer   c                 C  s   d S rt   rx   rF  rx   rx   ry   r    s    z!TemplateBuffer.get_reduction_typerp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zTemplateBuffer.should_allocateNr  r  )r  r  c                 C  s   |   dfd fS rl  ro  )rB  r  r  rx   rx   ry   r    s
    z#TemplateBuffer.simplify_and_reorder)NN)r   r   r   rV  r  r  r>   r  r  r  r  r  rx   rx   r  ry   r    s     r  c                      sh   e Zd Zddddd fddZdd	d
d fddZddddZddddZddddZ  ZS )TritonTemplateBufferNzOptional[Iterable[IRNode]]zOptional[OrderedSet[str]]r   )mutated_inputsallowed_prologue_inpsrs   c                   s   t  ||| |_g_|durtjjjtjjjf}t	j
jj}||v s`J d| d| jd    j fdd|D 7  _|r|nt _d_d_dS )a  
        NOTE:[TritonTemplates with multiple outputs]
        We want the ability for TritonTemplates to output multiple tensors. Triton
        kernels have no notion of outputs and this is done by creating tensors that
        are then mutated by the kernel. Currently our STORE_OUTPUT codegen doesn't
        support creating multinode outputs for triton templates.
        We work around this by creating an extra input buffer during the lowering
        and we mark them as mutated inputs.
        Nz$Mutated inputs are only allowed for z	 but got r   c                   s   g | ]}t t d |qS r  MutationOutputr  r   ra  r   rB  rx   ry   r     s   z1TritonTemplateBuffer.__init__.<locals>.<listcomp>)r  r  r  outputsr   r]   Zhigher_orderZflex_attentionZflex_attention_backwardr_   r   current_noder'  r2  r   r/   r  subgraph_inpssubgraph_outs)rB  r  r2  r  r  r  Zallowed_setr  r  r  ry   r    s&    


zTritonTemplateBuffer.__init__Frp   r}   r  c                   s   t  |}| jr| jng }| jr(| jng }|D ]J}t|tjrR|t|| q0t|t	rn||| q0|d u s0J q0|D ],}t|t	r||| q|d u sJ q|S rt   )
r  r  r  r  ru   r   r   updater   rm   )rB  r   resr  r  r4  r   r  rx   ry   r     s    

z)TritonTemplateBuffer.get_free_symbol_usesr/  rD  c                 C  s   | j S rt   )r  rF  rx   rx   ry   r  7  s    z TritonTemplateBuffer.get_outputsrH  c                 C  s   | j S rt   )r  rF  rx   rx   ry   get_allowed_prologue_inps:  s    z.TritonTemplateBuffer.get_allowed_prologue_inpsr   c                 C  s   d| j  d}|S )NzTritonTemplateBuffer(layout=r   rS  )rB  r   rx   rx   ry   r  =  s    zTritonTemplateBuffer.__str__)NN)F)	r   r   r   r  r  r  r	  r  r  rx   rx   r  ry   r    s     , r  c                      s   e Zd ZdZdddddd fddZd	d
ddZdd
ddZdd Zdd
ddZdd
ddZ	dd
ddZ
dd
ddZdd
ddZ  ZS )ChoiceCallera.  
    Represents a possible choice used in autotune_process.py.
    During autotuning, self.benchmark() is first called to get benchmark result,
    and if this choice is selected, self.output_node() is called to get the output_node.

    Children classes: TritonTemplateCaller, CUDATemplateCaller.
    r   r/  re  r   )r   r~   r  descriptionrs   c                   s&   t    || _|| _|| _|| _d S rt   )r  r  r   r  r~   r  )rB  r   r~   r  r  r  rx   ry   r  N  s
    
zChoiceCaller.__init__r  rD  c                  s2   |    tjr t fddS t d|iS )Nc                     s     S rt   rx   rx   algor   rx   ry   rT  `  r   z(ChoiceCaller.benchmark.<locals>.<lambda>r   )to_callabler5   Z/profile_bandwidth_with_do_bench_using_profilingrO   rE   	benchmark)rB  r   r   rx   r  ry   r  ]  s    zChoiceCaller.benchmarkc                 C  s   t d S rt   r  rF  rx   rx   ry   	call_namec  s    zChoiceCaller.call_namec                 C  s   t d S rt   r  rF  rx   rx   ry   r  f  s    zChoiceCaller.to_callablec                 C  s   |   S )z
        Hash key for the underlying kernel. By default, we assume there are no
        runtime params, so kernel hash key defaults to choice caller's hash key.
        )hash_keyrF  rx   rx   ry   kernel_hash_keyi  s    zChoiceCaller.kernel_hash_keyc                 C  s   t d S rt   r  rF  rx   rx   ry   r  p  s    zChoiceCaller.hash_keyrl   c                 C  s   t d S rt   r  rF  rx   rx   ry   r-  s  s    zChoiceCaller.output_nodez<dict[str, Union[PrimitiveInfoType, list[PrimitiveInfoType]]]c                 C  s   i S )zRInformation returned here is logged to the autotune log file when that is enabled.rx   rF  rx   rx   ry   	info_dictv  s    zChoiceCaller.info_dictc                 C  s   dS )NZunsupported_choicerx   rF  rx   rx   ry   autoheuristic_idz  s    zChoiceCaller.autoheuristic_id)r   r   r   rV  r  r  r  r  r  r  r-  r  r  r  rx   rx   r  ry   r
  E  s   r
  c                   @  s   e Zd ZddddZdS )TritonTemplateCallerBaser
   rD  c                 C  s   t d S rt   r  rF  rx   rx   ry   get_make_kernel_render  s    z/TritonTemplateCallerBase.get_make_kernel_renderN)r   r   r   r  rx   rx   rx   ry   r  ~  s   r  c                      s   e Zd ZdZddddddd fd	d
ZeddddZeddddZej	ddddZ
dddddZddddZ  ZS )MultiTemplateBufferaG  
    Represents a Buffer with multiple backing implementation choices.

    Choices can be TritonTemplates or ExternKernels. During scheduling if there is a potential
    epilogue we will benchmark each of the choices with the epilogue to determine an implementation.
    Otherwise, the fastest base choice will be chosen.
    re  r   z'Callable[[], dict[ChoiceCaller, float]]zlist[ChoiceCaller]rH  r   )r  r2  choice_timings_fnunfiltered_choicesr  rs   c                   s>   t  j||d |d || _d | _|| _tdd |D | _d S )N)r  r2  r  r  c                 s  s.   | ]&}t |tp$t |tjjjo$|jV  qd S rt   )ru   r  r   	_inductorselect_algorithmZExternKernelCallerZhas_out_variant)r   choicerx   rx   ry   r    s   
z/MultiTemplateBuffer.__init__.<locals>.<genexpr>)r  r  _choice_timings_fn_choice_timingsZoriginal_inputsr
  _output_plannable)rB  r  r2  r  r  r  r  rx   ry   r    s    zMultiTemplateBuffer.__init__rp   rD  c                 C  s   | j S )z^
        Are all possible choices TritonTemplates or Extern Kernels with out variants
        )r  rF  rx   rx   ry   output_plannable  s    z$MultiTemplateBuffer.output_plannablezdict[ChoiceCaller, float]c                 C  s   | j d u r|  | _ | j S rt   )r  r  rF  rx   rx   ry   choice_timings  s    

z"MultiTemplateBuffer.choice_timingsr  )callerc                 c  sR   t |tjjjsJ | j|jks$J | j}| | _zd V  W || _n|| _0 d S rt   )ru   r   r  r  TritonTemplateCallerr  r  r  )rB  r"  renderrx   rx   ry   swap_as_triton_caller  s    
z)MultiTemplateBuffer.swap_as_triton_caller)r"  rs   c                 C  sJ   t |tjjjsJ |  |jjks(J |  |jj	ks<J |
 | _d S rt   )ru   r   r  r  r#  r   r  r   r  r   r  r  )rB  r"  rx   rx   ry   finalize_as_triton_caller  s    z-MultiTemplateBuffer.finalize_as_triton_callerztuple[ChoiceCaller, float]c                 C  s    t | j| jjd}|| j| fS )NrU  )r  r!  r  )rB  Z
min_choicerx   rx   ry   get_min_choice  s    z"MultiTemplateBuffer.get_min_choice)r   r   r   rV  r  r  r   r!  r  r  r%  r&  r'  r  rx   rx   r  ry   r    s   r  c                      s>   e Zd Zddddd fddZdd	 Zdd
ddZ  ZS )CUDATemplateBufferrv   rb   rp   r   )workspace_sizetemplatesupports_epilogue_fusionrs   c                   s&   t  ||| || _|| _|| _d S rt   )r  r  r)  r*  r+  )rB  r  r2  r  r)  r*  r+  r  rx   ry   r    s    	zCUDATemplateBuffer.__init__c                 C  s   | j d ur| j S dS rq  )r)  rF  rx   rx   ry   r    s    z%CUDATemplateBuffer.get_workspace_sizerD  c                 C  s$   |   D ]}t| d d  qd S rt   )r  r]   r  r  )rB  r)  rx   rx   ry   emulate_store_fn  s    z#CUDATemplateBuffer.emulate_store_fn)r   r   r   r  r  r,  r  rx   rx   r  ry   r(    s   r(  c                      s4   e Zd Zdd fddZdd fddZ  ZS )CppTemplateBufferr   rD  c                   s&   t  ||| || _|| _d | _d S rt   )r  r  r*  r  r  )rB  r  r2  r  r*  r  r  rx   ry   r    s    zCppTemplateBuffer.__init__re  c                   sZ   t | jtrLt | jtsJ | jd }t |ts4J |j}t |tsHJ |S t  S d S rq  )	ru   r  MultiOutputLayoutr  r   r  re  r  r   )rB  Zfirst_outputr  r  rx   ry   r     s    
zCppTemplateBuffer.get_layout)r   r   r   r  r   r  rx   rx   r  ry   r-    s   r-  c                   @  sn   e Zd ZU ded< ddddZdddd	Zed
d
dddZedd Z	ddddZ
ddddZdS )r  r/  r2  r  rD  c                   s   t tj  }tj | jD ]H}t|tr@| fdd|D  qt|trNqq|	 |
  qt tj  fdd|  D }tj||t  dS )Nc                 3  s   | ]} |  V  qd S rt   r  ru  r  rx   ry   r    r   z/InputsKernel.get_read_writes.<locals>.<genexpr>c                 3  s   | ]} |  V  qd S rt   r  r  r/  rx   ry   r    s   )r  writesZindex_exprs)r/   r6   r;   r  r2  ru   r   r  r   r  r  r  Z
ReadWrites)rB  r  inputr0  rx   r/  ry   r    s     


zInputsKernel.get_read_writesr  c                 C  s
   |   jS rt   r  rF  rx   rx   ry   rL    s    zInputsKernel.get_readsrm   rq   c                 C  sz   t |tr|j}t |tr |j}t |tr>t |ts>t|}t |trR| |S t |t	r`|S t |t
tfsvJ ||S rt   )ru   rl   r  r  r  r!  rB  realize_inputunwrap_storage_for_inputTorchBindObjectr  r  rr   rx   rx   ry   r3    s    





z%InputsKernel.unwrap_storage_for_inputc                 C  s@   g }| D ]2}t |tr&dd |D }n
t|}|| q|S )Nc                 S  s   g | ]}t |qS rx   )r  r3  r   rx   rx   ry   r   *  r   z/InputsKernel.unwrap_storage.<locals>.<listcomp>)ru   r   r  r3  r[  )r2  Z
inputs_newrr   rx   rx   ry   r  %  s    

zInputsKernel.unwrap_storagerp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  0  s    zInputsKernel.is_externrv   c                 C  s   dS rN  rx   rF  rx   rx   ry   r  3  s    zInputsKernel.num_readsN)r   r   r   r   r  rL  r  r3  r  r  r  r  rx   rx   rx   ry   r    s   


r  c                   @  s(   e Zd ZddddZddddZdS )		NopKernelrp   rD  c                 C  s   dS r  rx   rF  rx   rx   ry   r  8  s    zNopKernel.is_no_opr  c                 C  s   t  S rt   r.   rF  rx   rx   ry   rL  ;  s    zNopKernel.get_readsN)r   r   r   r  rL  rx   rx   rx   ry   r6  7  s   r6  c                   @  sD   e Zd ZdZedd ZedddZedd Zd	d
ddZdS )ConcatKernelzn
    There isn't actually a real kernel for concat, we just change the
    storage for the upstream data.
    c                 C  s  |d   }|d  }t|d  }dg}|| g}d|  krPt|k sVn J tdt|D ]}||  }	|||  t|	t|ksJ ||  |ksJ ||   |ksJ tt|D ]>}
|
|kr||
 |	|
  ||
< qtjj	
||
 |	|
 ||
< q|||  qdt|}tjr@t|||d j}tt|D ]L}|| }t|rL| }t|trLt|j|jrLt|} qqLtdd |D }tjjjd }t|tsJ |du rtdd |D rt|}td t||||dg d}t|}g }tt|D ]}| || t j!|||| || dd	}|j"| t|| j#t$r||| j#% }n
|| j#}|& r"t'||   j(r"t)|s"||*  q"t|dkrtj+|t,j-rtj.| tj/||_0| 1|j"|_"tj2| |S )
Nr   r4   c                 s  s   | ]}t |V  qd S rt   )r   ru  rx   rx   ry   r  n  r   z&ConcatKernel.create.<locals>.<genexpr>Fc                 s  s@   | ]8}d |j v o6|j d  jtjdp6|j d  jtjdV  qdS )r  r  N)r,  r  r   r  r  r   argrx   rx   ry   r  r  s   
)r   r   r   r   r   r  r2  )rb  )3r   r   r   r   r   r   r[  r_   r   r   rF  r   r   r5   r  re  r  r   r   r   ru   r  r|  r   r   r!   r  r  r   r7  r  r  r\  r  r2  r  r  r  r  rV   r   rU   r  rd  r7   ZFOREACHZregister_operation_listr  r   r  r  )r  r2  r  r   r   r!  Zoffsets_startZoffsets_endr   Z
input_sizerr  Zoutput_striderr   r  Zany_input_is_storage_and_layoutZfx_node_argsZconcat_kernelkernelZop_namesZinput_bufferZinput_unwrappedrx   rx   ry   r  E  s    







 zConcatKernel.createNc                 C  s   t |tr| |j|S t |jtrt |jjtr:|jjs>dS |d u rJdS t|	 t|	 ksfdS t
dd t|	 |	 D S t |jjtot |jt S )NFTc                 s  s"   | ]\}}t jj||V  qd S rt   r  r  rx   rx   ry   r    s   z=ConcatKernel.can_realize_into_without_copy.<locals>.<genexpr>)ru   rl   can_realize_into_without_copyr  r  r  r  r   r   r  r
  r   r   ExternKernelAlloc)r  r  r  rx   rx   ry   r=    s$    
z*ConcatKernel.can_realize_into_without_copyc              	   C  s   t |ts*t|r*t|\}}t||d}t |ts<J |t |trT| |j|S t |tr|  t	|jdsvJ | 
||rt||j_|jS tj| | | dd t| | D d}| ||S )Nr  r  c                 S  s    g | ]\}}t jj||qS rx   r  r  rx   rx   ry   r     s   z-ConcatKernel.realize_into.<locals>.<listcomp>r  )ru   r!  r   r  rl   r  r  r  ru  r  r=  r  r  r  r  r   r   r  r   r   )r  r  r  r"  r  pwrx   rx   ry   r    s,    


	zConcatKernel.realize_intorp   rD  c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zConcatKernel.should_allocate)N)	r   r   r   rV  r  r  r=  r  r  rx   rx   rx   ry   r7  ?  s   
`
 r7  c                      sx  e Zd ZU dZded< ejedZded< dZ	ded	< dZ
d
ed< dZd
ed< ejedZded< dZded< dZded< dZded< ejedZded< ejedZded< dodd fddZddddZd dd!d"Zd#d$ Zd%d& Zddd'd(Zd)d* Zdpd
dd+d,d-Zd
dd.d/d0Zd1d2 Zed3d4 Zed5dd6d7Z ed8d9 Z!ed:d; Z"ed<d= Z#edqd?d@dAdBdCZ$edrdDdEZ%edsdFdGZ&edHdI Z'edJdK Z(edLdM Z)edNdO Z*dddPdQZ+dRdS Z,dtdTdUdVdWZ-dXdY Z.dZd[ Z/dud\d]Z0d^dd_d`Z1dddadbZ2dddcddZ3dedf Z4dgdh Z5dvdid djdkdlZ6d^ddmdnZ7e7Z8  Z9S )wrB  rx   ztuple[Any, ...]constant_argsr  zdict[str, Any]r   NzOptional[ReinterpretView]output_viewr   python_kernel_namecpp_kernel_namezIterable[str]ordered_kwargs_for_cpp_kernelzFOptional[Union[torch._ops.OpOverload, torch._ops.HigherOrderOperator]]op_overloadzOptional[list[dict[str, Any]]]arg_propertiesz#Optional[dict[str, dict[str, Any]]]kwarg_propertiesz"dict[sympy.Symbol, pytree.KeyPath]unbacked_bindingszlist[MutationOutput]mutation_outputsr   rD  c                   sn   t  j|||d || _|r |ni | _|| _|
| _| | | | |	| _| 	  i | _
g | _tjj| _d S Nr;  )r  r  r@  r   rA  rE  set_cpp_kernel_nameset_python_kernel_namerD  collect_arg_kwarg_propertiesrH  rI  r_   r   r  fx_node)rB  r   r  r2  r@  r   rA  rB  rC  rD  rE  r  rx   ry   r     s     

zExternKernel.__init__r/  c                 C  s   | g| j S rt   )rI  rF  rx   rx   ry   r    s    zExternKernel.get_outputsr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r  !  s    z%ExternKernel.get_unbacked_symbol_defsc                 C  s   t | jtjjr$dd | jjjD ndd tt| j	D | _
t | jtjjrbdd | jjjD ni | _t | jtjjr| jsdd | jjjD | _dd | jjjD | _ng | _d S )Nc                 S  s$   g | ]}|j s|j|j|jd qS ))r   r   r  )
kwarg_onlyr   	real_typer  ru  rx   rx   ry   r   (  s   z=ExternKernel.collect_arg_kwarg_properties.<locals>.<listcomp>c                 S  s   g | ]}i qS rx   rx   r   rx   rx   ry   r   2  r   c                 S  s   i | ]}|j |j|jd qS ))r   r  )r   rP  r  ru  rx   rx   ry   r   5  s   z=ExternKernel.collect_arg_kwarg_properties.<locals>.<dictcomp>c                 S  s   g | ]}|j r|jqS rx   rO  r   ru  rx   rx   ry   r   @  s   c                 S  s   g | ]}|j r|qS rx   )rO  ru  rx   rx   ry   r   C  s   )ru   rE  r   _ops
OpOverload_schema	argumentsr   r   r2  rF  allarg_propertiesrD  schema_kwargsrF  rx   rx   ry   rM  $  s*    


z)ExternKernel.collect_arg_kwarg_propertiesc                 C  s    t | jtr|   |   d S rt   )ru   r  r   apply_constraintr  rF  rx   rx   ry   r\  I  s    zExternKernel.decide_layoutc                 C  s    t | |\}}|r|| d S rt   )rR   Zmake_comment)rB  wrapperZ
origin_strZ_detailed_origin_strrx   rx   ry   codegen_commentN  s    zExternKernel.codegen_commentc                 C  s   t d S rt   r  rB  rY  rx   rx   ry   codegenS  s    zExternKernel.codegenrC  rs   c                 C  s   || _ tjjrt| jtjjs"d S | j}| j d u r~|j	dkrt|j
dkrV|jdd n|jdd}d| d| _ n
|jj| _ d S )Natenr  .r   r   z
at::_ops::z::call)rC  r_   r   cpp_wrapperru   rE  r   rR  rS  	namespaceZ_overloadnamer   ro  replacerT  r   )rB  rC  r<  opnamerx   rx   ry   rK  V  s    



z ExternKernel.set_cpp_kernel_name)rB  rs   c                 C  s`   || _ |d urd S | j}|d u r"n:t|tjjr@d|j | _ n|jdd d|j | _ d S )Nztorch.ops.higher_order.._ops..ops.r_  )	rB  rE  ru   r   rR  HigherOrderOperatorr   r   rb  )rB  rB  r<  rx   rx   ry   rL  n  s    z#ExternKernel.set_python_kernel_namec                 C  s:   |    }r|jntjj}tjjr4tjj| j|S | j	S rt   )
r   r   r_   r   Zdevice_typer`  rT  Zget_c_shim_func_namerC  rB  )rB  dr   rx   rx   ry   get_kernel_name}  s
    zExternKernel.get_kernel_namec                 C  s:   t j|  |  |  |  |  |  d}|  |S )N)r   r   r  r  r=  r;  )	r  r  r   r   r  r   rP  rN  ru  )rr   r?  rx   rx   ry   
copy_input  s    zExternKernel.copy_inputzituple[Any, list[Any], list[Any], Callable[[Any, Any], Any], Optional[dict[sympy.Symbol, pytree.KeyPath]]]c                   s  ||d}t |\} g g }g }|D ]^}t|toDt|t  d r\|| q(t|tjr|tj	j
jj|d d}|| q( fdd}	fdd|D }|D ]}
t|
rt|
dd	 qg }|D ]}
t|
ts|
 tj	jv r|tj	j|
   qt|
ts>|
 tj	jv r>|tj	j|
   qt|
trZ||
  qt|
tjjjr|
jj}|
jjd
kr|d usJ |tjj|   q|t|
dd q|	||\}}||i |}d }tjj }rdtj j!"d}t# }tj j$tj%j&j'kr$|d }t(tj }| t)|tj | W d    n1 sN0    Y  t*|||}t|t+t,fsz|gn|}|D ]N}t|tj-r|j.rd}tj	j j!"dd  }r| d| }|tj	_/q||||	|fS )Nr   r  )r  c                   sd   g }t | }t |}D ]&}|r0|t| q|t| qt| }|dg |di fS )Nr   r   )iterr[  nextpytreeZtree_unflattenr  )Znew_tensor_argsZnew_non_tensor_argsr  Z
it_tensorsZit_non_tensorsZ	is_tensorr  )	args_specis_arg_tensorrx   ry   unflatten_args  s    z3ExternKernel.process_kernel.<locals>.unflatten_argsc                   s   g | ]}  |qS rx   r2  ru  r  rx   ry   r     r   z/ExternKernel.process_kernel.<locals>.<listcomp>Tr  r   )r   r  r4   zEsparsity not handled. Please file issue for sparse inference weights.stack_tracez Found from : 
 )0rl  tree_flattenr[  ru   rm   GeneratorStater   r   r_   r   r   r   Zcreate_symintnoder   r  r  r  	constantsZtorchbind_constantsr4  	get_valuer   r  Zirr   r   r   r   Zdefault_generatorsZclone_stater   rx  r  r,  r  r   r'  _higher_order_opsZeffectsZwith_effectsr$   r)   r%   r   r   TensorZ	is_sparseZdisable_cudagraphs_reason)r  r<  r   r   Zbinded_argsZ	args_flattensor_argsnon_tensor_argsr:  ro  rr   Zexample_argsZdevice_indexnew_argsZ
new_kwargsexample_outputrH  r   Znode_meta_valctxZexample_out_lir   msgrr  rx   )rm  r  rn  ry   process_kernel  s    

		
.
zExternKernel.process_kernelc              	   C  sX  t |tsJ t |tr|S | }tj| }|dus@J | }|durd|j	v rt |j
tr|j	d jtjds|j	d jtjdr|t|  n|  tj| dd\}}|d }| |}tjj||}tjj||}	tjj||}
t||	|
 }||kr0td|	|
| tt|jt |! |" | |	|
dd	S )
z
        In order to pass this to an extern kernel we need a
        ReinterpretView not a View.  This allows us to avoid some
        unneeded copies.
        Nr  r8  r  rj   r   z@convert_to_reinterpret_view failed: stride=%s offset=%s index=%sr  r  )#ru   r  r!  r  r_   r   r  r  rP  r,  r  r   r  r   r  r  r  r!   r   r  r6   ri  r  r   rj  Zstride_varsZ
offset_varrW   rg  rh  rc  r  r  r  r   )r  rr   Zx_unwrap_viewra  Zx_unwrap_view_fx_nodeZ
index_argsr  rY  r   r  r   expectedrx   rx   ry   convert_to_reinterpret_view
  sj    






z(ExternKernel.convert_to_reinterpret_viewc                 C  s  |d u rt  S t|tjtjjjtfr0t|dS t|t	r\t
jtj|j| | dS t|trj|S t|tr| |jS t|trt| |j| dS t|tr|  t| rz| |W S  ty   Y n0 t|tr|  |S t|ttfr
|S |  |S )N)r  rw  r  )!r  ru   r   r   r   r   r   rv   r   rm  r_   r   Zadd_tensor_constantr   r  r   r   r   r  rl   r2  r  r!  r   r  ru  r   r  r  rc  r  NonTensorObjri  r5  rx   rx   ry   r2  O  s:    






zExternKernel.realize_inputc                 C  sD   t |r:t| dkr|S | D ]}|dkr$|  S q$| |S r  )r   r   r  ri  )r  rr   r   rx   rx   ry   require_stride1p  s    
zExternKernel.require_stride1FzOptional[Sequence[int]]rm  )r   r  c              	     s"  |d us d usJ   dv r( s(S trlt tr|r|tddt|rnttj	j
 jn||d S tddd | d S nt ttfr|r |sԈ rt  j r d urt S S t trlt  trtdnTt  trl|rF  |sh rlt   j rlS ttr|r |s rt  j rS ttrPtjtrPtjtsPt rPt jtsPzB| j_|r | j||dW S  r8| j |dW S W n t yN   Y n0 d } } d urtj	j
 fdd	t!t" D }|D ]}t#j$j%&|d
dq| 'tdd|| d |rt|sJ n8|r|d ur  d usJ t#j$j%(|t S S )N)r   r4   TF)r  r   r  r  r  zHthe MutationLayoutSHOULDREMOVE's real layout shouldn't be FlexibleLayoutr  c                   s4   g | ],}  | d r | dr|qS )r   r   )r  r]  r   r   r  r   rr   rx   ry   r     s   z0ExternKernel.require_strides.<locals>.<listcomp>r   r4   ))rp  r   ru   r   r   r  r  r   r_   r   r   Z
size_hintsr   r  r  r  r  r   r&  r  r  r7  r  rl   r  r  r!  r  r>  r  require_stride_orderrequire_exact_stridesrc  r   r   r   r  loweringZslice_ri  r|  )r  rr   r   r  r  Zexpanded_dimsZ	orig_sizer  rx   r  ry   require_stridesz  s    
	





zExternKernel.require_stridesc                 C  s   | j |||dS )N)r  r  r  )r  rr   r  r  rx   rx   ry   r    s    z"ExternKernel.require_exact_stridesc                 C  s   | j |||dS )N)r   r  r  )r  rr   r   r  rx   rx   ry   r    s    z!ExternKernel.require_stride_orderc                 C  s   |  |tS rt   )r  r  r5  rx   rx   ry   require_channels_last!  s    z"ExternKernel.require_channels_lastc                 C  s   |  |tS rt   )r  r  r5  rx   rx   ry   require_channels_last_3d%  s    z%ExternKernel.require_channels_last_3dc                 C  s.   dd }||r|S |  |t| S d S )Nc                 S  s*   dd }|| t jjv o(t jj||  jS )Nc              	   S  s(   z
|   W S  ttfy"   Y d S 0 d S rt   )r  AttributeErrorrc  rw   rx   rx   ry   safe_get_name,  s    
zPExternKernel.require_contiguous.<locals>.is_mkldnn_tensor.<locals>.safe_get_name)r_   r   ru  Z	is_mkldnn)rr   r  rx   rx   ry   is_mkldnn_tensor+  s    z9ExternKernel.require_contiguous.<locals>.is_mkldnn_tensorr  r   r   r   )r  rr   r  rx   rx   ry   rC  )  s    zExternKernel.require_contiguousc                 C  s   |  |t| S rt   r  r5  rx   rx   ry   require_contiguous_strides?  s    z'ExternKernel.require_contiguous_stridesc                 C  s   d S rt   rx   rF  rx   rx   ry   rX  G  s    zExternKernel.apply_constraintc                 C  s   t |ttfsJ t |tr$t|}| js2J dt|}t| j}||k rtd| j||  t||D ]6}| j| d }|	||v r|| n| j| d  qj|S )Nz/ExternKernel.arg_properties should not be emptyzv%s has %d unprovided positional arguments. Will check if they are in the keyword arguments or will use default values.r   r  )
ru   r   r   rF  r   rg  rh  rE  r   r[  )rB  r   r   Zn_argsZ
n_pos_argsr   arg_namerx   rx   ry   fill_non_provided_argsJ  s(    	


z#ExternKernel.fill_non_provided_argsr:  )r  c           	      C  s   t jjrg }d }|rD| jrDt| jt|ks4J ddd | jD }t| jD ]~\}}|d ur||| }|rz|dnd }n6t| j| }| jr|t| jk r| j| dnd }|	t jj
|| qN|S tt jj
j| jS d S )NzDnames passed to codegen_const_args does not match self.constant_argsc                 S  s   i | ]}| d |qS r   )r  r9  rx   rx   ry   r   x  s   z3ExternKernel.codegen_const_args.<locals>.<dictcomp>r   )r_   r   r`  rF  r   r@  r   r  r2  r[  rT  val_to_arg_strr\  )	rB  r  r  Zname_to_arg_propertiesr   rr   proptype_r   rx   rx   ry   codegen_const_argsl  s0    
zExternKernel.codegen_const_argsc                 C  s   t jjr2| jd ur2| g | j| j| j}d}n
| j}d}g }t|D ]h\}}t jjr| j	rl|t
| j	k stJ d| j	| d}|t jj|| qH|t jj| qH|r||   |S )NFTz-Invalid access to ExternKernel.arg_propertiesr   )r_   r   r`  rE  r  r2  r@  r   r   rF  r   r  r[  rT  r  r  r  )rB  r2  Zneed_codegen_constant_argsr   r   rr   r  rx   rx   ry   codegen_args  s&    zExternKernel.codegen_argsc                 K  s\   ||v r| |S || jv r(| j |S | jrJ|| jv rJ| j | dS t| ddS )zGiven an argument name, queries for values in (in order):
        1. any provided kwargs for this function.
        2. the class self.kwargs member.
        3. any available default arguments in self.allarg_properties.r  z not in self.allarg_propertiesN)r  r   rV  r7  )rB  r  r   rx   rx   ry   get_kwargs_value  s    

zExternKernel.get_kwargs_valuec                 C  s   t jjr| jd ur$t| jdkr$g S g }| jD ]p}|r@|dkr@q.| |}t|t	j
rb|| q.| jr|| jv r| j|dnd }|t jj|| q.ndd | j D }|S )Nr   r   r   c                 S  s(   g | ] \}}| d t jj| qS r  r_   r   rT  r  )r   kr  rx   rx   ry   r     s   z/ExternKernel.codegen_kwargs.<locals>.<listcomp>)r_   r   r`  rE  r   rW  rD  r  ru   r   r   r[  rV  r  rT  r  r   r  )rB  Zskip_outr   r  r  r  rx   rx   ry   codegen_kwargs  s*    

zExternKernel.codegen_kwargsr   c                 C  sR   | j d urJ| j j}t|dd}|dd}|ddd }| d| }nd}|S )	Nr   Zunknown_namespacerd  re  r_  r4   r   Z
unknown_op)rN  r'  r   rb  rsplit)rB  r'  Zop_namespaceop_namerx   rx   ry   get_op_name  s    
zExternKernel.get_op_namec                 C  sz   t jrvtjjsvt|  dkr"d S tjj|  }tjj| 	 }| 
 }|d|   d| d| d|d	 d S )Nr   zassert_size_stride(r9  r   )r5   Zsize_assertsr_   r   r`  rZ   r   rT  Zcodegen_shape_tupler  r  rU  r  )rB  rY  r   r   r  rx   rx   ry   codegen_size_asserts  s     z!ExternKernel.codegen_size_assertsc              	   C  sj   t jrftjjsf|  }|tjjv}|  }|rN|d| dt	 d|d n|d| d| d d S )Nzassert_alignment(r9  r   z	# buffer z (op: z) is assumed to be not aligned)
r5   Zalignment_assertsr_   r   r`  r  r
  r  rU  rS   )rB  rY  r   Zalignedr  rx   rx   ry   codegen_alignment_asserts  s    z&ExternKernel.codegen_alignment_assertsc                 C  s   |   }|  }|g g|fS )zD
        get output sizes and strides, for template_codegen
        )r   r  )rB  _sizeZ_striderx   rx   ry   get_group_stride  s    zExternKernel.get_group_stridec                   s  t jj|  }|  }fdd|D }dd tt|D ttt||jdd}dd t	|D fddttD }fd	d|D | 
 }|}t jj||g\}}}	td
\}
 tt| fdd|D }tt||}|t|fS )zC
        Manually get canonicalization of the output index
        c                   s   g | ]}  |qS rx   )r   ru  )r   rx   ry   r     r   z-ExternKernel.canonicalize.<locals>.<listcomp>c                 S  s   g | ]}t d | qS )rg  )rX   r   rx   rx   ry   r     r   T)rV  rM  c                 S  s   i | ]\}}||qS rx   rx   r   rx   rx   ry   r     r   z-ExternKernel.canonicalize.<locals>.<dictcomp>c                   s   g | ]} | qS rx   rx   r   r   rx   ry   r     r   c                   s   g | ]} | qS rx   rx   r   )r  rx   ry   r     r   cc                   s   g | ]} |qS rx   rx   ru  )add_varrx   ry   r     r   )r_   r   r   r   r  r   r   rZ  r  r   r  r  r?   r   r   r[   r   r|  r   )rB  r  r  Zindex_orderr   r  r   Z	new_sizesr   r  r   replacementrx   )r  r  r   r   ry   canonicalize  s$    
 zExternKernel.canonicalizerp   r  c                 C  sP   |rt nt}ttj  }| jD ]}|||O }q| j D ]}|||O }q:|S rt   )maybe_free_unbacked_symbolsmaybe_free_symbolsr/   r   r   r@  r   r   )rB  r   Zmaybe_get_symbolsr  r:  rx   rx   ry   r    s    

z!ExternKernel.get_free_symbol_usesc                   sP   t  dd }d|g}| fddt D 7 }|d j  |S )NrB  zpython_kernel_name=c                   s$   g | ]}|j  d t |j  qS r  )r   r   )r   r  rF  rx   ry   r   1  s   z(ExternKernel.__str__.<locals>.<listcomp>r  )r   r  fieldsr[  r=  r`  )rB  Zkernel_namerY  rx   rF  ry   r  ,  s    zExternKernel.__str__)rx   NNNNrx   N)N)NNF)F)F)N)F)F):r   r   r   r@  r   r  r  r   r   rA  rB  rC  r   rD  rE  rF  rG  rH  rI  r  r  r  rM  r\  rZ  r\  rK  rL  rh  r  ri  r  r  r  r2  r  r  r  r  r  r  rC  r  rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rx   rx   r  ry   rB    s   

       %
w
D
 
	    



" 
	  rB  c                      s@   e Zd ZddddZddd fddZd	dd
dZ  ZS )ExternKernelOutr   rD  c                 C  s   | |  d S rt   )Zgenerate_extern_kernel_outr[  rx   rx   ry   r\  =  s    zExternKernelOut.codegenrx   Nc
           
        sF   t  d || |||pi d ||||	
 tj| | _tj|  d S rt   )r  r  r  r_   r   r  r   r  )
rB  r  r2  r@  r   rA  rB  rC  rD  rE  r  rx   ry   r  @  s    zExternKernelOut.__init__rp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  [  s    zExternKernelOut.should_allocate)rx   NNNNrx   N)r   r   r   r\  r  r  r  rx   rx   r  ry   r  ;  s          r  c                      s&   e Zd Zdddd fddZ  ZS )RandomSeedsrv   r|  r   )countr   rs   c                   sF   t t j}t jt|t j|gdg |j|j|ggddtj	j
d d S )NrP  zaten.randint.low_outzat::_ops::randint_low_out::call)r  r2  r@  rB  rC  rE  )r   r  r~  r  r  r  r  r  r^  randintZlow_out)rB  r  r   Zlimitsr  rx   ry   r  `  s    zRandomSeeds.__init__r   r   r   r  r  rx   rx   r  ry   r  _  s   r  c                      sH   e Zd ZddddZddd fddZd	dd
dZdd Z  ZS )r>  r   rD  c                 C  s   | |  d S rt   )Zgenerate_extern_kernel_allocr[  rx   rx   ry   r\  t  s    zExternKernelAlloc.codegenrx   Nc	           	        sL   t  d || |||pi d ||||
 g | _tj| | _tj|  d S rt   )	r  r  r  r  r_   r   r  r   r  )	rB  r  r2  r@  r   rB  rC  rD  rE  r  rx   ry   r  w  s    zExternKernelAlloc.__init__rp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    z!ExternKernelAlloc.should_allocatec                 C  s   t d S rt   r  rF  rx   rx   ry   rX    s    z"ExternKernelAlloc.apply_constraint)rx   NNNrx   N)r   r   r   r\  r  r  rX  r  rx   rx   r  ry   r>  s  s         r>  c                      sR   e Zd ZdZddd fddZdddd	Zd
dddZddddZ  ZS )r   zP
    An output buffer that represents the mutation of a pre-existing buffer
    r  r   )mutating_noders   c                   sD   t  jd |d | }tj| |g| _|| _tj| | _	d S r  )
r  r  r  r_   r   r  mutation_namesr  r  r   )rB  r  Zmutated_noder  Zmutated_node_namer  rx   ry   r    s    zMutationOutput.__init__rD  c                 C  s   | j S rt   )r  rF  rx   rx   ry   rR    s    zMutationOutput.get_defining_oprS  c                 C  s   | j S rt   )r  rF  rx   rx   ry   r    s    z!MutationOutput.get_mutation_namesrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zMutationOutput.should_allocate)	r   r   r   rV  r  rR  r  r  r  rx   rx   r  ry   r     s
   r   c                      s|   e Zd ZU dZi Zded< eddd dddZeddd dd	d
Zdd fddZ	ddddZ
ddddZ  ZS )TMADescriptorad  
    An IR node representing a generic host-side TMA descriptor in the Triton API
    Mostly useful for user-defined Triton kernels relying on host-side TMA;
    but can, in principle, be used for Inductor's Triton templates, too.

    See TMADescriptorExperimental and TMADescriptorStable for the two implementations
    (the old API and the new API)
    zdict[Any, TMADescriptor]_CACHErm   ztuple[str, tuple[Any, ...]])r  tma_metars   c                 C  sX   t |dksJ |d dkr0t|g|d R  S |d dks@J t|g|d R  S d S )Nr   r   Zexperimentalr4   r  )r   TMADescriptorExperimentalTMADescriptorStable)r  r  r  rx   rx   ry   _create_impl  s
    zTMADescriptor._create_implc                 C  s2   t ||f}|| jvr(| ||| j|< | j| S rt   )idr  r  )r  r  r  rV  rx   rx   ry   r    s    
zTMADescriptor.creater  c                   sL   t  d tt|| d|t|d  || _tj	| | _
tj|  d S )Nr  )r  r  r  r!  r   r   r  r_   r   r  r   r  )rB  r  r2  r@  r  rx   ry   r    s    zTMADescriptor.__init__r   rD  c                 C  s   | |  d S rt   )Zgenerate_tma_descriptorr[  rx   rx   ry   r\    s    zTMADescriptor.codegenc                 C  s   | j S rt   r  rF  rx   rx   ry   
get_tensor  s    zTMADescriptor.get_tensor)r   r   r   rV  r  r   r  r  r  r  r\  r  r  rx   rx   r  ry   r    s   

r  c                      s0   e Zd ZdZd
dddddd fdd	Z  ZS )r  z
    the new host-side TMA Descriptor API:
    (the ones obtained via create_{1d,2d}_tma_descriptor calls).

    See also TMADescriptorStable for the new API.
    Nrm   list[Union[int, torch.SymInt]]r  r   )r  r)  
block_dimselement_sizers   c                   s   t |dv sJ t |t |ks$J |d u r6| j}|| _|| _|| _t | j| _|g}g | j| j| j}t j|||d d S )N)r4   r   r  r2  r@  )	r   r   rr  r)  r  r  r  r  r  )rB  r  r)  r  r  r2  r@  r  rx   ry   r    s*    
z"TMADescriptorExperimental.__init__)Nr   r   r   rV  r  r  rx   rx   r  ry   r    s    r  c                      s(   e Zd ZdZddd fddZ  ZS )r  z
    the new host-side TMA descriptor API
    (the ones obtained via TensorDescriptor.from_tensor).

    See also TMADescriptorExperimental for the old API.
    rm   r  )r  block_shapec                   s   || _ t j||g|d d S )Nr  )r  r  r  )rB  r  r  r  rx   ry   r  !  s    zTMADescriptorStable.__init__r  rx   rx   r  ry   r    s   r  c                      s8   e Zd Zdddddd fddZd	d
ddZ  ZS )SubgraphBufferre  r/  r'  	list[Any]r   )r  r~   r(  example_inputssubgraph_namec           	   	     s  t  d || || _|| _tj| | _tj|  tj	| j||| _
t| j}|D ]"}|| j
j|j< | j
j|j qXdd |D | _dd lm  m} t| j
N |jdddd | j
j| j  W d    n1 s0    Y  W d    n1 s0    Y  d S )Nc                 S  s   g | ]
}|j qS rx   r   )r   Zsym_varrx   rx   ry   r   B  r   z+SubgraphBuffer.__init__.<locals>.<listcomp>r   FZATEN)Zmax_autotuneZmax_autotune_gemmZmax_autotune_gemm_backends)r  r  r(  r  r_   r   r  r   r  make_subgraphsubgraphr5  r2  r  Zgraph_input_namesr[  
sym_inputsZtorch._inductor.configr  r5   set_graph_handlerr   run)	rB  r  r~   r(  r  r  r  Zsym_inpZinductor_configr  rx   ry   r  ,  s&    
zSubgraphBuffer.__init__r   rD  c                 C  sD   G dd d}dd | j D }||| jg | j|| jg d S )Nc                   @  s   e Zd ZddddZdS )z,SubgraphBuffer.codegen.<locals>.CodegenGraphrc   )r   c                 S  s   || _ |j| _d S rt   )r   r   )rB  r   rx   rx   ry   r  Q  s    z5SubgraphBuffer.codegen.<locals>.CodegenGraph.__init__N)r   r   r   r  rx   rx   rx   ry   CodegenGraphP  s   r  c                 S  s   g | ]}|  qS rx   rz  r   r   rx   rx   ry   r   U  r   z*SubgraphBuffer.codegen.<locals>.<listcomp>)r2  Z'codegen_subgraph_with_flattened_outputsr  r  r   )rB  rY  r  Zouter_inputsrx   rx   ry   r\  O  s    zSubgraphBuffer.codegen)r   r   r   r  r\  r  rx   rx   r  ry   r  +  s   #r  c                      sx   e Zd Zdd ZddddZddd	d
 fddZd	dddZdd fddZddddZddddZ	  Z
S )UserDefinedTritonKernelc                   s   ddl m} ddlm} || j g }g }g }t |rt dr`| fdd j	D  nt dsnJ | j
 t dr jD ]}| jj|  qnt d	sJ | j  j} j  |||fS )
Nr   )	Autotuner)kernel_side_tablerestore_idxc                 3  s   | ]} j j| V  qd S rt   )r   	arg_namesr   r<  rx   ry   r  k  s   zBUserDefinedTritonKernel.get_kernel_and_metadata.<locals>.<genexpr>restore_value	reset_idxreset_to_zero)Ztriton.runtime.autotunerr  *torch._higher_order_ops.triton_kernel_wrapr  Z
get_kernel
kernel_idxru   r  r  r  r  r  r[  r   r  r  configs)rB  r  r  r  restore_value_argsreset_to_zero_argsr   rx   r  ry   get_kernel_and_metadata^  s*    




z/UserDefinedTritonKernel.get_kernel_and_metadatar   rD  c                   s  ddl m}  \ }}}| |j||j\}}}fddjD }	t fdd jD }
g }g }g }g }t	
|	 tt	d|D ]\}}|| || t|tr||  ||  qt|ttttjfr|| |t| q||
v r&|d |t q|d u r`| rN|d |t n|  |  qtd	t| d
| q| |j||||||d jjd	 d S )Nr   )triton_version_uses_attrs_dictc                   s   i | ]}|  |qS rx   r  r   r  rF  rx   ry   r     s   z3UserDefinedTritonKernel.codegen.<locals>.<dictcomp>c                   s   g | ]} j | qS rx   )r  r   r  rx   ry   r     r   z3UserDefinedTritonKernel.codegen.<locals>.<listcomp>rU  r  zUnsupported arg type: r   T)	arg_typesZraw_argsZraw_keystriton_metar   r   Zoriginal_fxnode_name) Ztorch._inductor.utilsr  r  Z!define_user_defined_triton_kernelr   gridrD  r/   Z
constexprsrx  r  r  r   repeatr[  ru   rm   rz  r   rv   r  rp   r   r   r   r  rc  rZ  Zgenerate_kernel_callr   rN  r   )rB  rY  r  r  r  r  new_namer  Zextra_launch_argsZ
named_argsZconstexpr_namesr   r  Zraw_keys_filteredZraw_args_filteredr   r:  rx   )r<  rB  ry   r\  }  sx    	











zUserDefinedTritonKernel.codegenFrp   r}   r  c                   s   t  |t| j|B S rt   )r  r  r   r  r  r  rx   ry   r    s    z,UserDefinedTritonKernel.get_free_symbol_usesc                 C  s   t  S rt   r.   rF  rx   rx   ry   r    s    z0UserDefinedTritonKernel.get_unbacked_symbol_defsc                  sX  g }i }g }   D ]`\}}	t|	trbt|	}
||v rNt|
|| }
||
 |
||< q||	 |	||< qt	|dksJ |d 
 _t d tjd|t|| |_|_ \}}}} fdd|jD _ddlm} t	|dkr
|d jni } fdd||i  ||D _fddjD _tj d S )Nr   r  c                   s   g | ]}| v r|qS rx   rx   r9  kernel_argsrx   ry   r     s   z4UserDefinedTritonKernel.__init__.<locals>.<listcomp>)identify_mutated_tensorsc                   s   g | ]} | qS rx   rx   r   rV  r  rx   ry   r     s   c                   s    g | ]}t t jd | qS r  )r   r  r   r  rF  rx   ry   r     s   )r  ru   rl   r  r3  r2  r  r  r[  r   r   r   r  r  r  r   r  r  r  r  rD  r  r  r   Zmutable_argsrI  r_   r   r  )rB  r  r  Ztma_descriptor_metadatar  r2  r   r@  r  r  r   r<  r  r   r  Zautotuned_kwargsr  )r  rB  ry   r    sL    








z UserDefinedTritonKernel.__init__r/  c                 C  s
   t | jS rt   )r   rI  rF  rx   rx   ry   r    s    z#UserDefinedTritonKernel.get_outputsr{  c                 C  s   | j S rt   r  rF  rx   rx   ry   r     s    z"UserDefinedTritonKernel.get_device)F)r   r   r   r  r\  r  r  r  r  r   r  rx   rx   r  ry   r  ]  s   Q 	5r  c                      s^   e Zd ZdZddddZddddZd	dd
dZddddZdd fddZ  Z	S )InplaceBernoulliFallbackE
    This needs to be a custom class to handle mutation properly
    r   rD  c                 C  s   dd | j D \}tjjrP||   d| ddtt| j	 d|j
  n4||   d| ddtt| j	 d|j
  d S )Nc                 s  s   | ]}|  V  qd S rt   r  r  rx   rx   ry   r    r   z3InplaceBernoulliFallback.codegen.<locals>.<genexpr>r[  r9  z, NULL)r   )r2  r_   r   r`  rU  rh  r^  r\  reprr@  ending)rB  rY  rr   rx   rx   ry   r\    s    ,,z InplaceBernoulliFallback.codegenrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  (  s    z(InplaceBernoulliFallback.should_allocaterS  c                 C  s   | j d  gS rq  r2  r  rF  rx   rx   ry   r  +  s    z+InplaceBernoulliFallback.get_mutation_namesr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r  .  s    z1InplaceBernoulliFallback.get_unbacked_symbol_defsc                   sV   t  jd t| d| |g||d tj|  tj	| | _
tj|  d S )Nr  rE  )r  r  r  r   r  r_   r   r  r  r  r   r  )rB  rE  rr   r@  r  rx   ry   r  1  s    
z!InplaceBernoulliFallback.__init__
r   r   r   rV  r\  r  r  r  r  r  rx   rx   r  ry   r    s   r  c                      sr   e Zd ZdZddddZddddZd	dd
dZddddZdd fddZe	dddddZ
  ZS )InplaceCopyFallbackr  r   rD  c                 C  s    |   \}}}|||| d S rt   )r  codegen_device_copy)rB  rY  r  r  non_blockingrx   rx   ry   r\  D  s    zInplaceCopyFallback.codegenrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  H  s    z#InplaceCopyFallback.should_allocaterS  c                 C  s   | j d  gS rq  r  rF  rx   rx   ry   r  K  s    z&InplaceCopyFallback.get_mutation_namesr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r  N  s    z,InplaceCopyFallback.get_unbacked_symbol_defsc                   sJ   t  jd |||ddd tj|d   tj| | _tj|  d S )Nz
aten.copy_Zaoti_torch_copy_)rB  rC  r   )	r  r  r_   r   r  r  r  r   r  )rB  r  r2  r@  r  rx   ry   r  Q  s    zInplaceCopyFallback.__init__F)r  c                   s6    fdd||fD }|f}t t| d||}|S )Nc                   s   g | ]}  |qS rx   rp  r  rq  rx   ry   r   e  r   z.InplaceCopyFallback.create.<locals>.<listcomp>r  )r  r  r   )r  r  r  r  r2  r@  r  rx   rq  ry   r  c  s    zInplaceCopyFallback.create)F)r   r   r   rV  r\  r  r  r  r  r  r  r  rx   rx   r  ry   r  ?  s   r  c                   @  sV   e Zd ZdZddddZddddZd	dd
dZddddZddddZdS )MutatingFirstArgExternKernelr  r   rD  c                 C  sJ   g dd | j D tt| j}||   dd| d|j  d S )Nc                 s  s   | ]}|  V  qd S rt   r  r  rx   rx   ry   r  v  r   z7MutatingFirstArgExternKernel.codegen.<locals>.<genexpr>r[  r9  r   )r2  r\  r  r@  rU  rh  r^  r  )rB  rY  Zargrefsrx   rx   ry   r\  t  s    
z$MutatingFirstArgExternKernel.codegenrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  }  s    z,MutatingFirstArgExternKernel.should_allocaterS  c                 C  s   | j d  gS rq  r  rF  rx   rx   ry   r    s    z/MutatingFirstArgExternKernel.get_mutation_namesr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r    s    z5MutatingFirstArgExternKernel.get_unbacked_symbol_defsc                 C  s   dS r  rx   rF  rx   rx   ry   has_side_effects  s    z-MutatingFirstArgExternKernel.has_side_effectsN)	r   r   r   rV  r\  r  r  r  r  rx   rx   rx   ry   r  o  s   	r  c                      s"   e Zd Zdd fddZ  ZS )ResizeStorageBytesr   rD  c                   s   t |tsJ dt jd t| d| |g|fd tj	|
  tj| | _tj|  d| _d| _tjj|j
  d S )NzTODO: dynamic shapesr  )r@  z"inductor_ops.resize_storage_bytes_z&torch::inductor::resize_storage_bytes_)ru   rv   r  r  r  r   r  r_   r   r  r  r  r   r  rB  rC  never_reuse_buffersr  r  )rB  variabler!  r  rx   ry   r    s    
zResizeStorageBytes.__init__r  rx   rx   r  ry   r    s   r  c                      s0   e Zd Zdd fddZddddZ  ZS )SetSourceTensorKernelr   rD  c                   s   |   t j| ||gdtjjjjd t	j
j|j  t	j
j|  t	j
j|   | }tt|d|| tt|d|| g| _d S )Nz!torch.ops.aten.set_.source_Tensor)rB  rE  r  )r  r  r  r   r   r]   r^  set_Zsource_Tensorr_   r   r  r  r  r  r   r   r  rI  )rB  Zself_tensorZstorage_tensorr   r  rx   ry   r    s    
zSetSourceTensorKernel.__init__rS  c                 C  s   | j d  | j d  gS r  r  rF  rx   rx   ry   r    s    z2SetSourceTensorKernel.get_inputs_that_alias_output)r   r   r   r  r  r  rx   rx   r  ry   r    s   r  c                      sl   e Zd ZdZddddZddddZd	dd
dZddddZdddddddd fddZ  Z	S )ScatterFallbackz
    This needs to be a custom class to handle mutation properly.
    This class handles both aten.scatter_ and aten.scatter_reduce_.
    It also handle the case `src` being a scalar properly.
    r   rD  c              
   C  s   | j d }tjjr,ddd}||v r,|| }| jrJdd | jD \}}}ndd | jD \}}| jd }|||| jd	 ||g| j| j	| j|| 
  d S )
Nrw  r  r  )r  multiplyc                 s  s   | ]}|  V  qd S rt   r  r  rx   rx   ry   r    r   z*ScatterFallback.codegen.<locals>.<genexpr>c                 s  s   | ]}|  V  qd S rt   r  r  rx   rx   ry   r    r   r4   r   )r   r_   r   r`  src_is_tensorr2  r@  Zgenerate_scatter_fallbackrC  rB  r  )rB  rY  rw  Zget_operator_enumrr   r   r  rx   rx   ry   r\    s$    


zScatterFallback.codegenrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zScatterFallback.should_allocaterS  c                 C  s   | j d  gS rq  r  rF  rx   rx   ry   r    s    z"ScatterFallback.get_mutation_namesr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r    s    z(ScatterFallback.get_unbacked_symbol_defsNTrw  include_selfrv   r   )r  rw  r  rs   c          
   
     s   t |t _ jr2 fdd|||fD }|f}	n fdd||fD }||f}	t jd t| d ||	||dt|ddg|d t	j
|  t	j
  _t	j
  d S )	Nc                   s   g | ]}  |qS rx   rp  r  rF  rx   ry   r     r   z,ScatterFallback.__init__.<locals>.<listcomp>c                   s   g | ]}  |qS rx   rp  r  rF  rx   ry   r     r   r  r  rw  r  )rB  rD  rE  )ru   rl   r  r  r  r  r   r  r   r_   r   r  r  r  r   r  )
rB  rE  rr   r  r   r  rw  r  tensorsr@  r  rF  ry   r    s&    
zScatterFallback.__init__r  rx   rx   r  ry   r    s   r  c                      s^   e Zd ZdZddddZddddZd	dd
dZddddZdd fddZ  Z	S )IndexPutFallbackzQ
    This needs to be a custom class to handle mutation and indices properly
    r   rD  c           	      C  s   dd | j D ^}}}g }t|}t| jD ]6\}}| j| d urR|t| q,|tjjj	 q,|j
|  |||g|  R   d S )Nc                 s  s   | ]}|  V  qd S rt   r  r  rx   rx   ry   r    r   z+IndexPutFallback.codegen.<locals>.<genexpr>)r2  rj  r   r_  r[  rk  r_   r   rT  r  Zgenerate_index_put_fallbackrh  r  )	rB  rY  rr   r   valid_indicesr_  Ziter_valid_indicesr   r   rx   rx   ry   r\    s    zIndexPutFallback.codegenrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    z IndexPutFallback.should_allocaterS  c                 C  s   | j d  gS rq  r  rF  rx   rx   ry   r    s    z#IndexPutFallback.get_mutation_namesr}   c                 C  s   t  S rt   r.   rF  rx   rx   ry   r    s    z)IndexPutFallback.get_unbacked_symbol_defsc           	   	     s   | _ dd |D } fdd||g|D }d}t jd t| d ||fd||d tj j	d 
  tj  _tj  d S )	Nc                 S  s   g | ]}|d ur|qS rt   rx   r   rx   rx   ry   r     r   z-IndexPutFallback.__init__.<locals>.<listcomp>c                   s   g | ]}  |qS rx   rp  ru  rF  rx   ry   r     r   Zaoti_torch_index_put_outr  zaten.index_put_)rB  rC  rE  r   )r_  r  r  r  r   r  r_   r   r  r2  r  r  r   r  )	rB  rE  rr   r_  r   
accumulater  r  rC  r  rF  ry   r    s     	zIndexPutFallback.__init__r  rx   rx   r  ry   r    s   r  c                   @  s&   e Zd Zedd ZddddZdS )
DeviceCopyc                 C  s   |  s0tdd | D r0tjjs0||S tj	| tj	|
  td |f}tt|| | d| |g|S )Nc                 s  s   | ]}|t jjv V  qd S rt   )r_   r   ru  rS  rx   rx   ry   r  0  r   z$DeviceCopy.create.<locals>.<genexpr>zDeviceCopy in input programrP  )r  r
  rM  r5   aot_inductorZuse_runtime_constant_foldingr  r_   r   Zadd_device_infor   rN   r  r   r   r   r2  )r  rr   r   r  r@  rx   rx   ry   r  ,  s(    

zDeviceCopy.creater   rD  c                 C  sZ   |   }t|dksJ | jr<||d | j |d  n||d |  |d  d S )Nr   r   r4   )r  r   rA  r  rz  )rB  rY  r   rx   rx   ry   r\  D  s    zDeviceCopy.codegenN)r   r   r   r  r  r\  rx   rx   rx   ry   r  +  s   
r  c                      s^   e Zd ZdZddddZddddZd	d fd
dZddddZd	dddZ  Z	S )r   z;
    The result of a call to aten._local_scalar_dense.
    r  rD  c                 C  s   t  S rt   r.   rF  rx   rx   ry   rL  T  s    zDynamicScalar.get_readsrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  W  s    zDynamicScalar.should_allocater   c                   s<   |   t d ttdd| |g || _|| _d S Nr   r  )	ru  r  r  r  r   r   r  symkeypath)rB  r  r  r  r  rx   ry   r  Z  s    zDynamicScalar.__init__r}   c                 C  s   t | jgS rt   )r/   r  rF  rx   rx   ry   r  b  s    z&DynamicScalar.get_unbacked_symbol_defsc                 C  s   | |  d S rt   )Zcodegen_dynamic_scalarr[  rx   rx   ry   r\  e  s    zDynamicScalar.codegen)
r   r   r   rV  rL  r  r  r  r\  r  rx   rx   r  ry   r   O  s   r   c                      sn   e Zd ZdZddddZddddZd	d fd
dZddddZdddddZd	dddZ	  Z
S )r   z5
    The result of a call to aten._assert_scalar
    r  rD  c                 C  s   t  S rt   r.   rF  rx   rx   ry   rL  n  s    zAssertScalar.get_readsrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r  q  s    zAssertScalar.should_allocater   c                   s,   t  d ttddg  || _|| _d S r  )r  r  r  r   r   scalarr~  )rB  r  r~  r  rx   ry   r  t  s    zAssertScalar.__init__c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zAssertScalar.has_side_effectsFr3  c                 C  s   t | j|S rt   )r   r  r  rx   rx   ry   r    s    z!AssertScalar.get_free_symbol_usesc              	   C  s   t js
d S tt| jdd}tjjrhd| d}tjjj	| j
dd}|d| d| j d| d	 nRtjjj| j
dd}|d
| d |dt| j d ||   d d S )NFr3  zstd::to_string(r   )r  zif (!(z()) { throw std::runtime_error("Expected z but received " + z); }zif not (z):z    raise RuntimeError(z = None)r5   Zscalar_assertsrk  rj  r  r_   r   r`  rT  Zcodegen_cpp_sizevarr  rU  r~  Zcodegen_python_sizevarr  r  )rB  rY  symbolZ
symbol_strZsizevarrx   rx   ry   r\    s"    zAssertScalar.codegen)F)r   r   r   rV  rL  r  r  r  r  r\  r  rx   rx   r  ry   r   i  s   r   c                   @  s   e Zd ZU ded< ded< dS )ExternKernelNoder   r   zexport_schema.Noder   Nr   rx   rx   rx   ry   r    s   
r  c                      s   e Zd ZdZd'dddd fddZdd fd	d
ZddddZddddZdd Ze	dd Z
dd Zdd ZddddZdd ZddddZe	dd d!d"Zed#d$ Z fd%d&Z  ZS )(FallbackKernelz
    A class that represents a fallback kernel for handling operators that are not
    directly support by inductor. It currently supports functional ops, view ops,
    inplace aten ops, and mutating ops that are auto-functionalizable.
    NrH  r   rD  c                  sX  t  j|t|t||d d _| _t|tjjtjj	fsXJ d| dt
| d| _| _|d u rpi n| _tj j g  _g  _t jtjj	rd S d j v rd S  jj}tjj jr j|d   d S |jrt|std|   j j\}	}d	d
 fdd}
tjj ||	|D ]\}}|
|| q>d S )Nr  Fz#Fails to create FallbackKernel for r   z not supportedZ_c10d_functionalr   z'NYI: Can't generate FallbackKernel for r   rD  c                   s   t  jtjr t |ttfs J t jr>t |ttfr>J |d u rJd S  jd u rXd S dd fdd}t	 jr|d ur|D ]}|| qnt jsJ || d S )Nr   rD  c                   s:   j |    jjr6jtt|  d|  d S r  )	alias_namesr[  r  
alias_infoZis_writerI  r   r  r   r   )inforB  rx   ry   	add_alias	  s
    zPFallbackKernel.__init__.<locals>.handle_aliasing_and_mutation.<locals>.add_alias)
ru   r   r   ListTyper   r   library_utilsZis_tensor_like_typer  Zis_tensorlist_like_type)r  r:  r  Zoptional_tensor_argrF  )r  ry   handle_aliasing_and_mutation  s    
z=FallbackKernel.__init__.<locals>.handle_aliasing_and_mutation)!r  r  r   use_runtime_dispatchrH  ru   r   rR  rS  rf  r   rE  ro  r   r_   r   Zwarn_fallbackrB  r  r  r   rT  _libraryr  Zmutates_and_returns_first_argr[  r  
is_mutabler   rc  r2  r@  Z
zip_schema)rB  r  r<  ry  nontensor_argsro  r   rH  schemar   r  r  r:  r  rF  ry   r    sL    zFallbackKernel.__init__r  c                   sH   t   }| jtjjju rD| jD ]"}t|t	r |
t| }q |S rt   )r  r  rE  r   Z_primsZ	rng_primsZgraphsafe_run_with_rng_stater@  ru   rt  Z	with_readr6   r  r  )rB  r^  r:  r  rx   ry   r    s    


zFallbackKernel.get_read_writesc                 C  s   | |  | jt| dd S NrH  )(codegen_unbacked_symbol_defs_for_outputsr  r  r   r[  rx   rx   ry   codegen_unbacked_symbol_defs'  s    z+FallbackKernel.codegen_unbacked_symbol_defsr}   c                 C  s>   t | dd  }r4ttjjj|}|d us,J | S t S d S r  r   r*   r_   r   r   r   rl  r/   rB  rH  Zresolvedrx   rx   ry   r  ,  s    
z'FallbackKernel.get_unbacked_symbol_defsc                   s   t jG dd d  fdd| jD }| || j\}}tjjrzt| j	t
jjrz| ||}dd t| j	jj|D }ndd |D }| j| |S )Nc                   @  s$   e Zd ZU ded< ddddZdS )z)FallbackKernel.codegen_args.<locals>.Shimr
   refr   rD  c                 S  s   | j S rt   )r  rF  rx   rx   ry   r  ;  s    z2FallbackKernel.codegen_args.<locals>.Shim.__repr__N)r   r   r   r   r  rx   rx   rx   ry   Shim7  s   
r  c                   s   g | ]} |  qS rx   r  ru  r  rx   ry   r   >  r   z/FallbackKernel.codegen_args.<locals>.<listcomp>c                 S  s"   g | ]\}}t jj||jqS rx   )r_   r   rT  r  rP  )r   paramrr   rx   rx   ry   r   B  s   c                 S  s   g | ]}t jj|qS rx   r  ru  rx   rx   ry   r   G  r   )r  	dataclassr2  ro  r@  r_   r   r`  ru   rE  r   rR  rS  r  r   rT  rU  r   r  )rB  ry  r   r   rx   r  ry   r  6  s    zFallbackKernel.codegen_argsc                 C  s   | rdd | D nd }|r0dd | D }|d S t |tjrB|jS t |ttfrtdd |D }dd |D }t|dkr|d S |D ]}t|j	r|  S q|d S d S )	Nc                 S  s   g | ]}t |ts|qS rx   )ru   r4  r  rx   rx   ry   r   P  r   z.FallbackKernel.find_device.<locals>.<listcomp>c                 S  s   g | ]}|  r|  qS rx   r}  r9  rx   rx   ry   r   U  r   r   c                 s  s   | ]}t d |V  qd S rt   )r	  find_deviceru  rx   rx   ry   r  Z  s   z-FallbackKernel.find_device.<locals>.<genexpr>c                 S  s   g | ]}|r|qS rx   rx   )r   r   rx   rx   ry   r   ^  r   r4   )
ru   r   rx  r   r   r   r/   r   rV   r   )ry  r|  Znon_torch_bind_tensor_argsZdevicesZ
device_setr   rx   rx   ry   r"  M  s*    

zFallbackKernel.find_devicec                 C  s"   t | jtjjrdS t| j S r  )ru   rE  r   rR  rf  r#   r  rF  rx   rx   ry   r  g  s    zFallbackKernel.has_side_effectsc                 C  s   | j S rt   )r  rF  rx   rx   ry   r  l  s    z+FallbackKernel.get_inputs_that_alias_outputrS  c                 C  s   t | jdksJ | jS rN  )r   r  rF  rx   rx   ry   r  o  s    z!FallbackKernel.get_mutation_namesc                   sP  t d j tts"J jj\}	|}fddj
D }j}tjjsrg ||S tdd}|||}dd  t|tjjjr||d |d j}n|jj}t|dkrjrڈjnj}|d j} ||g}	n fd	dt|jD }	t tjj ||	i d
d}
tjj !|
 g ||S )a  
        ProxyExecutor Design Note
        We export the ExternFallbackNodes (for custom ops) into a serialized file
        and run it with a host side proxy executor to address the ABI problem
        This is currently only implemented for fbcode. Eventually, we will also make this work for OSS.
        Detailed design doc can be found at
        https://docs.google.com/document/d/1wC4DOZFaYym2t1Esz0X5yxlLI3RDnSiyRbUus3bkJ64/edit?usp=sharing
        z4Extern kernel node added for node %s with target %s.c                   s   g | ]}j |fi  qS rx   r  r  r   rB  rx   ry   r     s   z<FallbackKernel.export_extern_kernel_node.<locals>.<listcomp>Nc                 S  sJ  t | tjtjfr~|}t |ttfr<t|dks4J |d }t | tjrbtjj	tj
| ddS |d u snJ tjj	ddS nt | tjrt |  tjrtjj	dd |D d	S t | tjrt |  tjr|d u rtjj	tjj	ddd
S tjj	tjj	tj
| ddd
S n.t | tjr4tjj	|dS tdt|  d S )Nr4   r   r   )Z	as_tensorT)Zas_nonec                 S  s   g | ]}t j| d qS )r   )export_schemaTensorArgumentr  )r   r   rx   rx   ry   r     s   zZFallbackKernel.export_extern_kernel_node.<locals>.handle_single_output.<locals>.<listcomp>)Z
as_tensors)Zas_optional_tensor)Zas_intzUnsupported return type )ru   r   Z
TensorTypeZNoneTyper   r   r   r$  ZArgumentr  r%  r  r  getElementTypeOptionalTypeZOptionalTensorArgumentZIntTypeRuntimeErrorr   )return_typer)  r   rx   rx   ry   handle_single_output  sL    

zFFallbackKernel.export_extern_kernel_node.<locals>.handle_single_outputr   r4   c                   s   g | ]\}} |j |qS rx   )rP  )r   Zreturn_schemar)  )r*  rx   ry   r     s   )r'  r2  r  metadata)r   r   )"rg  rh  r  rE  ru   r	  ro  r2  r@  r  rD  r_   r   Zaot_moder   Zserialize_inputsr   rw  	torchbindCallTorchBindr  returnsrT  r   r  rI  rP  r   r  r$  ra   r   Zextern_kernel_nodesr[  )rB  r   Zordered_kwargsr'  
serializerZnamed_argumentsr.  r  r)  Zoutput_argumentsr   rx   )r*  r   rB  ry   export_extern_kernel_nodes  sL    	
/



z(FallbackKernel.export_extern_kernel_nodec                   s  j }|jdkrVt|tjjs"J tjjrddl	m
} t||vrtd| d_n4|jdkrtt|tjjsJ ntjjr|tjjv_tjjrt|tjjrjsddd	 fd
d jj\}t|fddjD }t fddt||jjD _| jrd }| j fddj |j!rZj!nj" n,|# tj$t%r&| '| (| d S )Nr^  r   )inductor_fallback_opszG%s is missing a c-shim implementation, using proxy executor as fallbackTZ
_quantizedztorch.JitTyperp   )r   rs   c                   s$   t | tjr |  S t | tjS rt   )ru   r   r'  r&  Z
NumberTyper  	is_numberrx   ry   r3    s    z)FallbackKernel.codegen.<locals>.is_numberc                 3  s    | ]}j |fi  V  qd S rt   r  r  r#  rx   ry   r    s   z)FallbackKernel.codegen.<locals>.<genexpr>c                 3  s&   | ]\}}t |to |jV  qd S rt   )ru   complexrP  )r   r  r'  r2  rx   ry   r    s   c                     s   g      S rt   )r  r  rx   rF  rx   ry   rT  "  r   z(FallbackKernel.codegen.<locals>.<lambda>))rE  ra  ru   r   rR  rS  r_   r   r`  Ztorchgen.aoti.fallback_opsr1  r   rg  r  r  r5   r  Zcustom_ops_to_c_shimsro  r2  r@  rx  r  rD  r  r   rT  rU  rZ  r0  Z,generate_fallback_kernel_with_runtime_lookupr  rB  r  rI  Zgenerate_fallback_kernelr  re  r  r  r  )rB  rY  r<  r1  r   Z	args_iterZexported_argsrx   )r3  r   rB  ry   r\    sb    









zFallbackKernel.codegenr   r)  c                 C  s"   t | j| jt|  t|  S rt   )r  r   r   rL   r   r   r5  rx   rx   ry   tensor_to_layout0  s    

zFallbackKernel.tensor_to_layoutc                   s,  t jf}||vrtjjnt }|2  j|g|R i |\}}}}	}
W d    n1 sZ0    Y  tdd |D  ||}|st	|t
jjjrt
d}|d u rĈ t|d||||	|
dn&|sJ d t|d||||	|
d fdd|g }t	|tttfr |_n|g_|S )	Nc                 s  s   | ]}t |V  qd S rt   )r	  r9  rx   rx   ry   r  J  r   z(FallbackKernel.create.<locals>.<genexpr>r   r  r
  z"Not sure where to find device infoc                   s   t ttfr4t fddttD S t trX fdd D S t tj	rt
 }tjsststjj|j |S t trS t tjrjjS d u sJ dt dd S d S )Nc                 3  s*   | ]"} | t |fg V  qd S rt   r   r   generate_outputr_  r)  rx   ry   r  k  s   zAFallbackKernel.create.<locals>.generate_output.<locals>.<genexpr>c                   s*   i | ]"\}}| |t |fg qS rx   r7  )r   rV  r  r8  rx   ry   r   p  s   zBFallbackKernel.create.<locals>.generate_output.<locals>.<dictcomp>zFallbackKernel output type z is not supported)ru   r   r   r   r   r   r   r  r   rx  MultiOutputr6  r5    assume_unaligned_fallback_outputr\   r_   r   r
  r  r   rv   ZSymIntr   r  )r)  r_  ra  r  r9  Zhas_unaligned_inputpacked)r_  r)  ry   r9  i  s<    



z.FallbackKernel.create.<locals>.generate_output)r^  Z*_fused_moving_avg_obs_fq_helper_functionalr_   r   rx  r   r  r  r"  ru   r   rw  r,  r-  r   r  r.  r   r   r   r  )r  r<  r   r   Zfake_incorrect_kernelscontextr|  ry  rz  ro  rH  r   r  rx   r<  ry   r  9  sT    



	"
zFallbackKernel.createc                   s
   t   S rt   )r  rX  rF  r  rx   ry   rX    s    zFallbackKernel.apply_constraint)N)r   r   r   rV  r  r  r  r  r  r  r"  r  r  r  r0  r\  r6  r  r  rX  r  rx   rx   r  ry   r	    s*    	l

oN
Xr	  c                      sH   e Zd ZdZddddZddddZd	d
dd fddZ  ZS )ComplexViewz9View a complex number as two dtyped numbers or vice versarp   rD  c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    zComplexView.should_allocaterS  c                 C  s   | j d  gS rq  r  rF  rx   rx   ry   r    s    z(ComplexView.get_inputs_that_alias_outputNr
  r   c                  s   t  j||||||d d S )Nr
  )r  r  )rB  r  r<  ry  r  ro  rH  r  rx   ry   r    s    
zComplexView.__init__)r   r   r   rV  r  r  r  r  rx   rx   r  ry   r?    s
   r?  c                   @  s$   e Zd ZU ded< ddddZdS )r.  r|  r   r{  rD  c                 C  s   | j S rt   r  rF  rx   rx   ry   r     s    zMultiOutputLayout.get_deviceN)r   r   r   r   r   rx   rx   rx   ry   r.    s   
r.  c                      sd   e Zd ZddddZddddd fd	d
ZddddddZddddZddddZ  ZS )r:  r   rD  c                 C  s(   | |  | js$| | | | d S rt   )Zcodegen_multi_output!skip_size_stride_alignment_checksr  r  r[  rx   rx   ry   r\    s    

zMultiOutput.codegenFr   zlist[tuple[Any, ...]])r  r_  rs   c                   s>   t  d ||gd tj| | _tj|  || _|| _d S rl  )	r  r  r_   r   r  r   r  r_  r@  )rB  r  r1  r_  r@  r  rx   ry   r    s
    zMultiOutput.__init__rp   r}   r  c                 C  s   | j d |S rq  )r2  r  r  rx   rx   ry   r    s    z MultiOutput.get_free_symbol_usesc                 C  s&   t | jdkr"t| jd tr"dS dS )Nr4   r   TF)r   r2  ru   r-  rF  rx   rx   ry   r    s
    zMultiOutput.should_allocaterS  c                 C  s   dd | j D S )Nc                 S  s.   g | ]&}t |trt| d kr| qS r   )ru   r	  r   r  r  r   r4  rx   rx   ry   r     s   
z<MultiOutput.get_inputs_that_alias_output.<locals>.<listcomp>r2  rF  rx   rx   ry   r    s    z(MultiOutput.get_inputs_that_alias_output)F)F)	r   r   r   r\  r  r  r  r  r  rx   rx   r  ry   r:    s     r:  c                   @  sb  e Zd ZU dZded< ddddZddd	d
ZddddZddddZddddZ	ddddZ
dwdddddZddddd Zddd!d"Zddd#d$Zddd%d&Zddd'd(Zdxd*ddd+d,d-Zd*dd.d/d0Zd1dd2d3d4Zdyd1ddd5d6d7Zd8dd9d:Zd;dd<d=Zddd>d?Zd@ddAdBZdCddDdEZdFddGdHZdddIdJZdddKdLZdMddNdOdPZdQddRdSZdddTdUZdQddVdWZ dCddXdYZ!dzddZd[d\d]Z"d^dd_d`Z#daddbdcZ$d{ddddedfdgZ%e&dhddidjZ'dkddldmZ(dhddndoZ)dpddqdrZ*e&dsdt Z+dddudvZ,e,Z-dS )|r  zC
    TensorBox / StorageBox allow in-place mutation of Tensors
    rm   r  rp   rD  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r    s    z!MutableBox.has_exceeded_max_readsr{  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r     s    zMutableBox.get_devicer  c                 C  s
   | j  S rt   r[  rF  rx   rx   ry   r    s    zMutableBox.make_loaderr  c                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r    s    zMutableBox.make_indexerr  c                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r    s    zMutableBox.get_strider   c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r    s    zMutableBox.get_nameNr  r  c                 C  s   | j |S rt   )r  r  r  rx   rx   ry   r    s    zMutableBox.has_large_inner_fnrv   r   r  c                 C  s   | j |S rt   r  r  rx   rx   ry   r    s    zMutableBox.mark_reusec                 C  s
   | j  S rt   r  rF  rx   rx   ry   r    s    zMutableBox.realize_hintc                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r    s    zMutableBox.unwrap_viewc                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r    s    zMutableBox.is_input_bufferc                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r    s    zMutableBox.freeze_layoutFr  r  c                 C  s   | j ||S rt   )r  r  r  rx   rx   ry   r    s    z*MutableBox.freeze_layout_with_stride_orderr   c                 C  s   | j |S rt   )r  r  r  rx   rx   ry   r    s    z(MutableBox.freeze_layout_with_fill_orderr  r  c                 C  s   | j |S rt   )r  r  r  rx   rx   ry   r    s    z(MutableBox.freeze_layout_with_same_orderr  c                 C  s   | j ||S rt   )r  r  r  rx   rx   ry   r    s    z+MutableBox.freeze_layout_with_exact_stridesr  c                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  $  s    zMutableBox.get_read_writesr  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   rL  '  s    zMutableBox.get_readsc                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  *  s    zMutableBox.num_readsrh   c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  -  s    zMutableBox.get_storage_numelr   c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  0  s    zMutableBox.get_reduction_typer  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  3  s    zMutableBox.get_reduction_sizec                 C  s
   | j  S rt   r  rF  rx   rx   ry   r  6  s    zMutableBox.is_externc                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  9  s    zMutableBox.is_no_opr|  r  c                 C  s   | j |S rt   r  r~  rx   rx   ry   r  <  s    zMutableBox.constant_to_devicerS  c                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  ?  s    zMutableBox.get_mutation_namesc                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  B  s    zMutableBox.get_operation_namec                 C  s
   | j  S rt   )r  r  rF  rx   rx   ry   r  E  s    z'MutableBox.get_inputs_that_alias_outputc                 C  s
   | j  S rt   r  rF  rx   rx   ry   ru  H  s    zMutableBox.realizer}   r  c                 C  s   | j |S rt   r  r  rx   rx   ry   r  K  s    zMutableBox.get_free_symbol_usesrH  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   rM  P  s    zMutableBox.get_read_namesrQ  c                 C  s
   | j  S rt   )r  rR  rF  rx   rx   ry   rR  S  s    zMutableBox.get_defining_oprv  rw  c                 C  s   | j |S rt   )r  rz  ry  rx   rx   ry   rz  V  s    zMutableBox.codegen_referencer   c                 C  s
   | j  S rt   r  ri  rF  rx   rx   ry   r  Y  s    zMutableBox.layoutre  c                 C  s
   | j  S rt   r  rF  rx   rx   ry   r   ^  s    zMutableBox.get_layoutc                 C  s
   | j  S rt   rC  rF  rx   rx   ry   ri  a  s    zMutableBox.get_output_specrl  c                 C  s
   | j  S rt   rY  rF  rx   rx   ry   r   d  s    zMutableBox.get_sizec                 C  s   | j jS rt   )r  r   rF  rx   rx   ry   r   g  s    zMutableBox.dtypec                 C  sn   t | jtr8t| j dt| jj d}d}| jj}nt| j d}| j}d}|tt||g}d|S )Nr[  z))r   
)ru   r  r  r   r   r]  r   r^  )rB  Zline0Zendlr  rY  rx   rx   ry   r  k  s    

zMutableBox.__str__)N)F)F)F)N).r   r   r   rV  r   r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rL  r  r  r  r  r  r  r  r  r  r  ru  r  rM  rR  rz  r  r  r   ri  r   r   r  r  rx   rx   rx   ry   r    s^   
   
r  c                   @  s   e Zd Zedd ZdS )rl   c                 C  s   t | tr| S tt| S rt   )ru   r   rl   r  )r  rx   rx   ry   r    s    
zTensorBox.createN)r   r   r   r  r  rx   rx   rx   ry   rl     s   c                   @  sl   e Zd ZddddZdd Zdddd	Zd
dddZddddZdd Zdd
dddZ	dd Z
dS )r  rp   rD  c                 C  s&   t | jttfr"| j tjjv S dS r  )ru   r  r  r!  r  r_   r   r  rF  rx   rx   ry   r    s    zStorageBox.is_input_bufferc                 C  s   t | jto| j tjjv S rt   )ru   r  r  r  r_   r   ru  rF  rx   rx   ry   r    s    zStorageBox.is_module_bufferr   c                 C  s   t | jtttttfr | j S t | jtt	t
tfsBJ t| j| j }| j }td t| j | j | j d| jd| _tj| j| j_tj| j | j| j_|| j_|| j_| jjS )NrP  rQ  )ru   r  rX  r  r  r!  r  r  r  r7  r  r  r   rP  rN  r   r   r   r   r_   r   r  r   r  r9  r=  r;  )rB  r=  r;  rx   rx   ry   ru    s<    



	
zStorageBox.realizer   c                 C  s,   t | jttfr(| j jdkr(|   dS )zL
        Called on buffers we expect to be forced to realize later.
        r4   N)ru   r  r  r7  r  Znontrivial_read_countru  rF  rx   rx   ry   r    s
    zStorageBox.realize_hintc                 C  s"   t | jto |  tjkp |  S rt   )ru   r  r  r  r5   Zrealize_acc_reads_thresholdr  rF  rx   rx   ry   r    s    z!StorageBox.has_exceeded_max_readsc                   sh   |dkrdt | jttfrdt| jrN| j  ddg}t fdd|D rNdS |  tj	kpb| 
 S dS )zj
        A heuristic to decide if we should realize a tensor
        that is used multiple times.
        r4   expZsigmoidc                 3  s   | ]}| j v V  qd S rt   )Zused_opsru  Zopcountrx   ry   r    r   z5StorageBox.should_realize_on_reuse.<locals>.<genexpr>TF)ru   r  r  r7  r  r  r  r  r5   Zrealize_reads_thresholdr  )rB  r  Z	heavy_opsrx   rF  ry   should_realize_on_reuse  s    

z"StorageBox.should_realize_on_reuserv   r  c                 C  s   |  |r|   d S rt   )rG  ru  r  rx   rx   ry   r    s    
zStorageBox.mark_reusec                 C  s
   | j  S rt   r  rF  rx   rx   ry   r    s    zStorageBox.num_readsN)r   r   r   r  r  ru  r  r  rG  r  r  rx   rx   rx   ry   r    s   !
r  c                   @  s*   e Zd ZU ded< ded< dZded< dS )Subgraphr   r   r'  graph_moduleNzOptional[GraphLowering]r   )r   r   r   r   r   rx   rx   rx   ry   rH    s   
rH  r  )buffersrs   c                 C  s,   dd | D } t tdd | D t | k S )Nc                 S  s"   g | ]}t |tr| n|qS rx   )ru   r!  r  r   r  rx   rx   ry   r     s   z(_has_aliased_buffers.<locals>.<listcomp>c                 s  s   | ]}t |V  qd S rt   )r  rK  rx   rx   ry   r    r   z'_has_aliased_buffers.<locals>.<genexpr>)r   r/   )rJ  rx   rx   ry   _has_aliased_buffers  s    rL  c                      sr   e Zd ZU dZdZded< dZded< dZded< d	d
ddd fddZe	d	dddZ
ddddZ  ZS )InvokeSubgraphz.
    Ir node for the invoke_subgraph HOP.
    NOptional[Subgraph]r  zOptional[list[TensorBox]]operandsOptional[list[MultiOutput]]r  rH  zlist[TensorBox]r.  r   )r  rO  r  rs   c                   s6   t  jd ||d || _tj| | _tj|  d S rJ  )r  r  r  r_   r   r  r   r  )rB  r  rO  r  r  rx   ry   r    s    zInvokeSubgraph.__init__)r  c                   s  ddl m} tjj}d }|jd }r:|d dd  }n|jdd  }dd |D } fdd|D }g }t|D ]2\}	}
t	|
t
r||
 qt|||
||	  qt|}|jd u rtjj|j||jd	|_t|j |jj|  W d    n1 s0    Y  |jj}d }|D ] }
t	|
t
s|
 } q:q|d usHJ t||t|d
ddddfddfddt|D }|_|S )Nr4   )constrain_to_fake_tensoreager_input_valsr   r   c                 S  s   g | ]}|j d  qS r  r,  ru  rx   rx   ry   r     r   z)InvokeSubgraph.create.<locals>.<listcomp>c                   s   g | ]}  |qS rx   rp  ru  rq  rx   ry   r     r   r(  r  r  r  )r  rO  r  rm   rv   r)  indc                   sP   t | ttfr| S tt|  |  |  |  | 	 j
d t|fgddS d S )Nr  T)r@  )ru   r   r  r:  r  r   r   r   r  r   r   r   rU  )invoke_subgraphrx   ry   create_output@  s    z,InvokeSubgraph.create.<locals>.create_outputc                   s   g | ]\}} ||qS rx   rx   )r   r   r)  )rX  rx   ry   r   Q  r   )r  rQ  r_   r   r  r,  r  r   r   ru   r   r[  r  rI  r   r  r  graph_outputsr   rM  r.  r  )r  r  rO  rQ  r  fake_operandsrR  fx_operandsZnew_operandsr   operandr  r   rx   )r  rX  rW  ry   r    sP    
*
zInvokeSubgraph.createrD  c                 C  s   | |  d S rt   )Zcodegen_invoke_subgraphr[  rx   rx   ry   r\  U  s    zInvokeSubgraph.codegen)r   r   r   rV  r  r   rO  r  r  r  r  r\  r  rx   rx   r  ry   rM    s   
PrM  c                      s   e Zd ZU dZded< dZded< dZded< dZded< dZd	ed
< dddddddd fddZ	e
dddddddZddddZddddZ  ZS )ConditionalNr   	predicate7Optional[list[Union[TensorBox, ShapeAsConstantBuffer]]]rO  rN  true_subgraphfalse_subgraphrP  r  rm   -list[Union[TensorBox, ShapeAsConstantBuffer]]rH  r.  z,Optional[dict[sympy.Symbol, pytree.KeyPath]]r   )r^  rO  r`  ra  r  rH  rs   c           	        sj   || _ || _|| _|| _t|g| \}}t jd |||d |d urL|| _tj	
| | _tj	|  d S N)r   r  r2  r@  )r^  rO  r`  ra  _split_by_sym_typer  r  rH  r_   r   r  r   r  )	rB  r^  rO  r`  ra  r  rH  sym_argsry  r  rx   ry   r  a  s    	zConditional.__init__rl   )r^  true_fnfalse_fnrO  c              	     s*   |}fdd|D }tjjjd }dd |D }||fD ]^}|jd u r@tjj|j||jd|_t|j |jj	|  W d    q@1 s0    Y  q@|jj
}|jj
}	d|fd|	ffD ]$\}
}t|rtd|
 d	| qt|t|	ksJ ||	ftt||	D ]r\}\}}| | ks>J |||f| | ks^J |||f| j| jksJ |||fqtd
d |g| D }ttjjjtjjjdd }|d usJ dt||||t|d|dddddd  fddtt|tjjjd D }|_|S )Nc                   s   g | ]}  |qS rx   rp  ru  rq  rx   ry   r     r   z&Conditional.create.<locals>.<listcomp>r  c                 S  s   g | ]}|j d  qS r  rS  ru  rx   rx   ry   r     r   rT  rf  rg  zVOutput aliasing is currently not supported in compiled torch.cond. The outputs of the z% subgraph of torch.cond are aliased: c                 s  s    | ]}t |ts| V  qd S rt   )ru   r   r   )r   orx   rx   ry   r    s   
z%Conditional.create.<locals>.<genexpr>rH  zcannot determine devicer  )r^  rO  r`  ra  r  rH  zUnion[int, torch.SymInt]zUnion[int, sympy.expr]r   rs   c                 S  s   t | tr| S | jjS rt   )ru   rv   r   r  )r   rx   rx   ry   _maybe_expr  s    
z'Conditional.create.<locals>._maybe_exprc              
     sf   g | ]^\}\}}t t| |  fd d| D  fdd| D | jdt|fgqS )c                   s   g | ]} |qS rx   rx   r   r  rj  rx   ry   r     r   z1Conditional.create.<locals>.<listcomp>.<listcomp>c                   s   g | ]} |qS rx   rx   rk  rl  rx   ry   r     r   r  )	r:  r  r   r   r   r   r   r   r   )r   r   r)  Zmerged_output)rj  conditionalrx   ry   r     s   
r  )r2  r_   r   r  r   r  rI  r   r  r  rY  rL  r7  r   r   r   r   r   r   r   rk  r*   r   r   r,  r  r]  r.  r  )r  r^  rf  rg  rO  r[  rZ  r  Ztrue_outputsZfalse_outputsr   r  r   Zt_oZf_or   rH  rx   )rj  r  rm  ry   r  }  sj    

,  (	zConditional.createrD  c                 C  s*   | |  ||  | jt| di  d S r  )Zcodegen_conditionalr  r  r  r   r[  rx   rx   ry   r\    s    
zConditional.codegenr}   c                 C  s>   t | dd  }r4ttjjj|}|d us,J | S t S d S r  r  r  rx   rx   ry   r    s    
z$Conditional.get_unbacked_symbol_defs)r   r   r   r^  r   rO  r`  ra  r  r  r  r  r\  r  r  rx   rx   r  ry   r]  Y  s   
Vr]  r  z-tuple[list[ShapeAsConstantBuffer], list[Any]])r   rs   c                 C  s<   g }g }| D ]&}t |tr(||j q|| q||fS rt   )ru   r   r[  r  )r   Znon_sym_argsre  r:  rx   rx   ry   rd    s    
rd  c                      s   e Zd ZU dZded< dZded< dZded< dZded< dZded	< d
d
ddddd fddZ	e
ddd
d
dddZddddZ  ZS )	WhileLoopNr_  carried_inputsadditional_inputsrN  cond_subgraphbody_subgraphrP  r  rb  rH  r.  r   )ro  rp  rq  rr  r  rs   c                   sZ   || _ || _|| _|| _t|| \}}t jd |||d tj	| | _
tj|  d S rc  )ro  rp  rq  rr  rd  r  r  r_   r   r  r   r  )rB  ro  rp  rq  rr  r  re  ry  r  rx   ry   r    s    zWhileLoop.__init__)cond_fnbody_fnro  rp  c              	     s  ddl m} dddddd}tjjjd }tjjjd	 }|| }	d
d |	D }
dd |D }dd |D }fdd|D }|||}fdd|D }|||}||  ||fD ]}|jd u rtjj|j|	|jd|_t	|jR |jj
|
  ||u r*t|jjt|ksJ ||jj||j_W d    q1 s@0    Y  q|jj}|jj}t|rttd| t|dksJ ||d }t|ts| tjksJ |t| dksJ |t dksJ d d  }|d us J t|t|ksJ ||ftt||D ]\}\}}dddddd}|| |  || |  | | ksJ ||||f| | ksJ |||f| j| jks,J |||fq,t||||t|dd|jd urt|jjtjjsJ ||jj|
d }t | fddD }fddt|D }fdd|! D }|_"fd d|D _#t$|t$|fd!dt%t|D }t||D ].\}}|& tjj'v rtjj()|&  q|S )"Nr   )check_input_alias_and_mutationz'list[TensorBox | ShapeAsConstantBuffer]z,list[Union[int, torch.SymInt, torch.Tensor]])tensor_boxesfake_tensorsrs   c                 S  sb   t | t |ksJ g }t| |D ]:\}}t|tjrR|tj|| dd q"|| q"|S )NFr  )	r   r   ru   r   rx  r[  rB  r  r   )rv  rw  retr  Zfkrx   rx   ry   _require_exact_strides  s    
z0WhileLoop.create.<locals>._require_exact_stridesr  c                 S  s   g | ]}|j d  qS r  rS  ru  rx   rx   ry   r   1  r   z$WhileLoop.create.<locals>.<listcomp>c                 S  s   g | ]}|j d  qS r  rS  ru  rx   rx   ry   r   2  r   c                 S  s   g | ]}|j d  qS r  rS  ru  rx   rx   ry   r   3  r   c                   s   g | ]}  |qS rx   rp  ru  rq  rx   ry   r   5  r   c                   s   g | ]}  |qS rx   rp  ru  rq  rx   ry   r   7  r   rT  zOutput aliasing is currently not supported in compiled torch.while_loop. The outputs of the body_fn subgraph of torch.while_loop are aliased: r4   z9torch.while_loop is assumed to have at least one operand.zSequence[Union[int, Any]]r   )	lhs_exprs	rhs_exprsrs   c                 S  s(   t | |D ]\}}tjj|| q
d S rt   )r   r_   r   r   rF  )r{  r|  lhsrhsrx   rx   ry   _guard_list_equalsq  s    z,WhileLoop.create.<locals>._guard_list_equalsr  )ro  rp  rq  rr  r  r   c                   s   g | ]} | qS rx   rx   r   r   )
all_inputsrx   ry   r     r   c                   s   i | ]\}}| vr||qS rx   rx   )r   r   r   )mutated_idx_setrx   ry   r     s   z$WhileLoop.create.<locals>.<dictcomp>c              
     sF   g | ]>\}}t t| | | | | jd  t|fgqS )r  )	r:  r  r   r   r   r  r   r   r   )r   r   r)  
while_looprx   ry   r     s   c                   s   g | ]}t |j| qS rx   )r   r  rA  r  rx   ry   r     s   c                   s$   g | ]}| v rt nt qS rx   )rk  r  )r  mutated_inputs_iteroutputs_iterrx   ry   r     s   )*Ztorch._higher_order_ops.utilsru  r_   r   r  r   r  rI  r   r  r  r   rY  rL  r7  ru   r   r   r   rp   r   r   r   r   r  r   r   rn  r.  moduleZfxZGraphModuler/   r  r  rI  rj  r   r  r  r  r  )r  rs  rt  ro  rp  ru  ry  Zfx_carried_inputsZfx_additional_inputsZfx_all_inputsZfake_all_inputsZfake_carried_inputsZfake_additional_inputsr  Zcond_outputsZbody_outputsr  r   r   r*  Zbor  Zmutated_idxsr  Zreal_outputsZall_outputsr4  r   rx   )r  r  r  r  r  r  ry   r    s    


*
" (	




zWhileLoop.createrD  c                 C  s   | |  d S rt   )Zcodegen_while_loopr[  rx   rx   ry   r\    s    zWhileLoop.codegen)r   r   r   ro  r   rp  rq  rr  r  r  r  r  r\  r  rx   rx   r  ry   rn    s   
 )rn  c                      sJ   e Zd Zddddd fddZdd fdd	Zd
dddZ  ZS )r   Nr
  r   rD  c             	     s~   t  j|||||d |d ddlm} dd |D }	||g ||	R |}
|
d usVJ |
| _tjj|
d | _	| tjj|
< d S )N)r   rH  r   )get_effect_keyc                 S  s    g | ]}t |tr|jn|qS rx   )ru   r4  r   )r   r'  rx   rx   ry   r     s   z,EffectfulKernel.__init__.<locals>.<listcomp>)
r  r  Ztorch._higher_order_ops.effectsr  effect_typer_   r   Zeffectful_opsr  prev_effect_buffer)rB  r  r<  ry  r  ro  r   rH  r  Zuncovered_argsr  r  rx   ry   r    s$    
zEffectfulKernel.__init__r  c                   s0   t   }| jd ur,|jt| j  |S rt   )r  r  r  r  r  r6   r  r  )rB  r^  r  rx   ry   r    s    

zEffectfulKernel.get_read_writesrp   c                 C  s   dS r  rx   rF  rx   rx   ry   r    s    z EffectfulKernel.has_side_effects)N)r   r   r   r  r  r  r  rx   rx   r  ry   r     s    	 
r   c                   @  s   e Zd ZdS )r  Nr  rx   rx   rx   ry   r    s   r  c                   @  sh   e Zd ZU ded< ded< ddddZdd	dd
ddZddddZddddZddddZdS )r4  r   r   z+Union[FakeScriptObject, torch.ScriptObject]r   rD  c                 C  s   | j S rt   r   rF  rx   rx   ry   r    s    zTorchBindObject.get_nameNrv  rw  c                 C  s   | j S rt   r   ry  rx   rx   ry   rz    s    z!TorchBindObject.codegen_referencec                 C  s   | j S rt   r   rF  rx   rx   ry   rv    s    zTorchBindObject.get_valueztorch.ScriptObjectc                 C  s    t | jtjr| jS | jjS d S rt   )ru   r   r   ZScriptObjectZreal_objrF  rx   rx   ry   get_real_obj   s    zTorchBindObject.get_real_objrv   c                 C  s@   |   }t| }t|d }dd |D }ttj|dS )Nr   c                 S  s(   g | ] }t |tjr| |  qS rx   )ru   r   rx  r  Znumelru  rx   rx   ry   r      s   z1TorchBindObject.get_buf_bytes.<locals>.<listcomp>)	r  r   Z__obj_flatten__rl  rs  re  rw  operatorr  )rB  Zreal_script_objZ	flat_dictZ
flat_elemsZ
flat_sizesrx   rx   ry   get_buf_bytes   s    zTorchBindObject.get_buf_bytes)N)	r   r   r   r   r  rz  rv  r  r  rx   rx   rx   ry   r4    s   
r4  c                   @  s>   e Zd ZU ded< ded< ddddZdd	dd
ddZdS )rt  r   r   r|  r   rD  c                 C  s   | j S rt   r   rF  rx   rx   ry   r     s    zGeneratorState.get_nameNrv  rw  c                 C  s   | j S rt   r   ry  rx   rx   ry   rz     s    z GeneratorState.codegen_reference)N)r   r   r   r   r  rz  rx   rx   rx   ry   rt     s   
rt  c                   @  s`   e Zd ZddddZddddZddd	d
ddZedd	dddZeddddZdS )_CollectiveKernelrp   rD  c                 C  s   dS r  rx   rF  rx   rx   ry   r  !   s    z!_CollectiveKernel.should_allocatec                 C  s   dS r  rx   rF  rx   rx   ry   r  $   s    z"_CollectiveKernel.has_side_effectsNr   r   r]  c                 C  sB   t | jtjju sJ d| j}|jj| _dd |jjD | _	d S )Nz,Setting cpp kernel needs a valid op_overloadc                 S  s   g | ]}|j r|jqS rx   rQ  ru  rx   rx   ry   r   0   s   z9_CollectiveKernel.set_cpp_kernel_name.<locals>.<listcomp>)
r   rE  r   rR  rS  rT  r   rC  rU  rD  )rB  rC  r<  rx   rx   ry   rK  )   s    
z%_CollectiveKernel.set_cpp_kernel_namez!Union[TensorBox, list[TensorBox]]r1  c                   s  t jj4 | j||g|R i |\}}}}}	W d    n1 sB0    Y  |	rbJ | d|	 |D ]}
|
  qf|d   | t d||||t|}j	
 fdd|D  j
dd |D  d|v rj	tt d|d  j|d   d S )Nr3  r   r  c                   s   g | ]}t t d |qS r  r  r  r   r=  rx   ry   r   U   r   z4_CollectiveKernel.create_inplace.<locals>.<listcomp>c                 S  s   g | ]}|  qS rx   r  rA  rx   rx   ry   r   Y   r   r   )r_   r   rx  r  ru  r   r  rl  Ztree_leavesrI  r  r  r[  r   r  )r  r<  r2  r   r   _example_outputry  rz  ro  rH  
tensor_argZinpsrx   r  ry   create_inplace:   s<    



z _CollectiveKernel.create_inplacerB  c                   s<  t jj4  j||g|R i |\}}}}}	W d    n1 sB0    Y  |	rbJ | d|	 |D ]}
|
  qft|tr ||} t|d|||| fddt	|D _
tj
|D ]&\}}tjst|st jj|j qƈj
S   |||||tjst|s,t jjj g_
S d S )Nr9  r  c                   s(   g | ] \}}t  |t|fgqS rx   )r:  r6  r   )r   r   r  r  r=  rx   ry   r      s   z9_CollectiveKernel.create_out_of_place.<locals>.<listcomp>)r_   r   rx  r  ru  ru   r   r"  r.  r   r  r   r5   r;  r\   r
  r  r   r6  )r  r<  r2  r   r   r|  ry  rz  ro  rH  r  r   ra  r  rx   r  ry   create_out_of_placew   sV    



z%_CollectiveKernel.create_out_of_place)N)	r   r   r   r  r  rK  r  r  r  rx   rx   rx   ry   r      s   <r  c                      s>   e Zd Zdd ZedddddZdd	 fd
dZ  ZS )_WaitKernelc                 C  sd   | j d }t|tr |j d gS t|tr\|j d }t|trX|jd \}}|j | gS g S g S d S rq  )r2  ru   r  r:  r_  )rB  r4  Zcollr   r   rx   rx   ry   get_volatile_reads   s    




z_WaitKernel.get_volatile_readsrl   r   )r4  rs   c           	      C  s   t jj& | ||\}}}}}W d    n1 s40    Y  |rTJ | d| | t| d||||}|jtt| d|| d S )Nr3  r  )	r_   r   rx  r  r  r   rI  r[  r   )	r  r<  r4  r  ry  rz  ro  rH  r=  rx   rx   ry   create_wait   s(    

z_WaitKernel.create_waitr  rD  c                   s6   t   }|  }|D ]}|jt|  q|S rt   )r  r  r  r  r  r6   r  r  )rB  r^  Zvolatile_readsZvrr  rx   ry   r     s
    
z_WaitKernel.get_read_writes)r   r   r   r  r  r  r  r  rx   rx   r  ry   r     s   r  r  ri  c                 C  sh   t | ttfrt| S t | ttfrJttj  }| D ]}|t	|O }q4|S t | t
jr^t| S t S d S rt   )ru   r-   r   r'   r   r   r/   r   r   r  r   rx  r   r  r   rx   rx   ry   r     s    r  c                 C  sh   t | ttfrt| S t | ttfrJttj  }| D ]}|t	|O }q4|S t | t
jr^t| S t S d S rt   )ru   r-   r   r&   r   r   r/   r   r   r  r   rx  r  rx   rx   ry   r     s    r  )N)N)T)T)T)T)TFNFN(-  
__future__r   r  r  re  rx  loggingr  textwrapr;  r  collections.abcr   r   r   r   r   enumr   r	   r
   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   Zunittest.mockr   r   r   r   r   Ztorch._export.serde.schemaZ_exportZserder  r$  Ztorch._library.utilsr  r  r  Ztorch._loggingr   Ztorch.fxZtorch.utils._pytreeZ_pytreerl  Ztorch._dynamo.utilsr   Ztorch._export.serde.serializer   Z*torch._higher_order_ops.auto_functionalizer   Ztorch._inductorr   Ztorch._prims_commonr   r   r    r!   r"   Ztorch._subclasses.fake_tensorr#   Z%torch.fx.experimental.symbolic_shapesr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   Ztorch.utils._ordered_setr/   Ztorch.utils._sympy.functionsr0   r1   r2   Ztorch.utils._sympy.symbolr3   rU  r5   r6   Zcodegen.commonr7   r8   r9   r:   r;   r<   r=   r>   r?   Z	loop_bodyr@   Zops_handlerrA   rB   rC   rD   Zruntime.benchmarkingrE   Zruntime.hintsrF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   Zvirtualizedr]   r^   r_   Z"torch._library.fake_class_registryr`   Ztorch.fx.nodera   Zcodegen.cuda.cuda_templaterb   r   rc   rd   ro   r   r   __version__r  r  ImportErrorre   rf   rg   rv   rh   r  ri   	getLoggerr   rg  r]  r^  r   r   rn   rz   r!  r|   r   r   r   r   r   r   r  r  r   r   r   r   r   r   r  r  r  r  r&  r.  r5  rm   r  r  r  r  r  r  r6  r7  ZINNER_FN_TYr  r  r  r  r  r  r   r  r  r  r	  r  r   r(  r0  r8  r  r!  rW  r\  rk  rm  rn  rq  rs  r   re  r  r   r  r  r  r  r  r  r  r  r  r  r  r   rX  r  r  rp   r   ZPrimitiveInfoTyper
  r  r  r(  r-  r  r6  r7  rB  r  r  r>  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r  r	  r?  r.  r:  r  rl   r  rH  rL  rM  r]  rd  rn  r   r  r4  rt  r  r  r  r  rx   rx   rx   ry   <module>   sJ  ,0\

,

"	


&
	 |K  #?     q:& _ I     <
_N+: .U(R U 'T 
  LEN"9GA ,      V#)>(2 9*0K/$9   q+ W	j  M/  8