o
    ZhW                    @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dl	mZ d dlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0 d dlm1Z1 d dl2Z2d dl3Z3d d	l4m5Z5 d d
l6m7Z7 d dl8m9Z9 e(rd dlm:Z:m;Z;m<Z< d dl3m=Z=m>Z>m?Z? d dl@mAZA d dlBmCZC d dlDmEZE d dlFmGZG ddlHmIZI ddlJmKZK ddlLmMZM ddlNmOZOmPZPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZY g dZZe)dZ[e\dd_ddZ]d dl^m_Z_ d d l`maZa d d!lbmcZc d d"ldmeZe d d#lfmgZg d d$lhmiZi d d%ljmkZkmlZlmmZmmnZnmoZo d d&lpmqZqmrZr d d'lsmtZtmuZu dd(lvmwZw dd)lxmyZz ejd*kZ{e|e}Z~e)d+Zee2je2jf Ze&e*e3jee3j?f  Zd,d-d.Zd/Zd/Zd/Zd0Zd1Zeed @ d kred2ksJ d3d`d6d7Zdad;d<ZG d=d> d>e2jZ	@dbdcdFdGZe\ddddHdIZdedMdNZdfdQdRZdgdVdWZdhdZd[Zdid_d`ZydjdcddZdkdhdiZdldldmZdmdpdqZdrds fdndxdyZdoddZdpdqddZ		drdsddZ					dtduddZdvddZdwddZdxddZdyddZdzddZe.dZe)dddZG dd de'e#eef Zd{ddZd|ddZd}ddZd~ddÄZ	ddddʄZdddτZddd҄ZdddքZdddلZdddބZdddZdddZdddZdddZdddZdddZdddZg Zd}ed< dddZdddZd dlZdddZej			ddddZdddZdddZe\d2dddZG dd de%ZejG dd dZG dd dZG dd deƃZejdddZG dd dZG dd deɃZe\dddd!d"Zej\dd#d$Z̐dd%d&Z͐dd)d*Zΐddd+d,Zϐdd1d2ZАdd4d5Zѐdd6d7ZҐd8d8d9dd<d=ZӐdd@dAZԐddEdFZe\dddGdHZe\dddJdKZאddLdMZؐddNdOZِddPdQZڐddRdSZېddWdXZܐ	8		8	ddd^d_Zݐddd`daZG dbdc dcZߐddhdiZddkdlZddmdnZddodpZddqdrZddsdtZddvdwZejddzd{Z	ddddZdddZdddZdddZdddZdddZdddZejdddZdddZe\ddddZe\ddddZe\ddddZdddZdddZdddZddddZddddZdddZdddZG dd dejZdddZdddZdddZ	ddddZ dddÄZddƐdǄZddȐdɄZdd̐d̈́Zddѐd҄Zdds fddڐdۄZdds fddݐdބZdddZdddZ	ejG dd dZ
ejdddZdddZdddZdddZdÐddZdĐddZdŐddZdƐddZdǐddZdȐddZdɐddZdʐddZdːddZd̐ddZd͐ddZdΐddZdϐddZdАd d!Zddd"d#Zdd$d%Zd&d'd(d)d*d+d,Zd-d. e  D Z!e"d/Z#dѐd0d1Z$dҐd2d3Z%dӐd6d7Z&dӐd8d9Z'e\ddԐd;d<Z(ejG d=d> d>Z)i Z*d?ed@< dՐdDdEZ+d֐dFdGZ,e)dHZ-e)dIZ.G dJdK dKee-e.f Z/e-ddLdddMdאdQdRZ0dؐdTdUZ1dِdWdXZ2G dYdZ dZejZ3e\ddڐd[d\Z4ddd]d^Z5dS (      )annotationsN)
CollectionIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeGuard)mock)DeviceProperties)
OrderedSet)tree_map_only)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)ShapeEnv)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudaZmpsxpuTreturnstrc                  C  s>   dd t D } t| dksJ t| dkrd}|S |  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xr=   r=   D/var/www/auris/lib/python3.10/site-packages/torch/_inductor/utils.py
<listcomp>R   s    z get_gpu_type.<locals>.<listcomp>r(   r   r8   )	GPU_TYPESlenpop)Z
avail_gpusZgpu_typer=   r=   rD   get_gpu_typeP   s   rI   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRanges)config)ceildivwin32_Tz.cubinz.spv)r8   r9         @      zmust be power of 2nbytesintc                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr(   )ALIGN_BYTES)ra   r=   r=   rD   _align   s   rd   v
sympy.Exprboolc                 C  s<   t | tjtjfrttt| jS t | tpt	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddZMaxallmap_is_alignedargsaligngcdrc   )re   r=   r=   rD   rm      s   rm   c                   @  s&   e Zd ZdZdZdZeddd	Zd
S )ro   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr(   Tvaluerf   r;   Optional[sympy.Expr]c                 C  s,   t |ttjfrtt|S t|r|S d S N)rh   rb   ri   Integerrd   rm   )clsrr   r=   r=   rD   eval   s
   z
align.evalN)rr   rf   r;   rs   )__name__
__module____qualname____doc__nargs
is_integerclassmethodrw   r=   r=   r=   rD   ro      s    ro      d   fnCallable[[], Any]warmuprepfloatc                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]	}|  |   q)|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]}|   qYt jjt jjjgd}
t|	D ]	}|  |   qot j  W d	   n1 sw   Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )aR  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
    g    Ar8   )dtypedeviceT)Zenable_timing   r(   )Z
activitiesNz
raw eventsZself_device_time_total)Zsort_by	row_limitc                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Sync)Zdevice_typerL   CUDAnamerB   eventr=   r=   rD   rE      s
    z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS r   r=   )rB   ir   Znum_event_per_groupr=   rD   rE      s
    zprofiling time breakdown)r   c                 s      | ]}|j V  qd S rt   )Zdevice_time_totalr   r=   r=   rD   	<genexpr>       z+do_bench_using_profiling.<locals>.<genexpr>g     @@zprofiling results: %s ms)r?   r8   synchronizeemptyrb   EventrecordrangeZzero_Zelapsed_timemaxprofilerprofileZProfilerActivityr   logdebugZkey_averagestablerM   eventsrG   RuntimeError	enumerateZ_build_treesum)r   r   r   cacheZstart_eventZ	end_event_Zestimate_msZn_warmupZn_repeatpr   Zfiltered_eventsZactual_eventsresr=   r   rD   do_bench_using_profiling   sh   




r   c               
   C  s   zddl m}  tjdd | d uotttjdd dW S  ty&   Y dS  t	y@ } zdt
|v s5J W Y d }~dS d }~ww )	Nr   )	roi_alignztorchvision::nmsZMetaZtorchvisionr   Fztorchvision::nms does not exist)Ztorchvision.opsr   r?   _CZ%_dispatch_has_kernel_for_dispatch_keyhasattrr>   opsImportErrorr   r<   )r   er=   r=   rD   has_torchvision_roi_align   s   
r   r   "Union[Optional[torch.device], str]torch.devicec                 C  s`   | d u r
t djS t| trt | } | jdvr.| jd u r.t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
r?   tensorr   rh   r<   typer   rJ   ZWorkerZcurrent_devicer   Zdevice_interfacer=   r=   rD   decode_device   s   


r   itIterable[sympy.Expr]c                 C  s   t tj| tjjS rt   )	functoolsreduceoperatormulri   SZOner   r=   r=   rD   sympy_product	     r   seq1Sequence[sympy.Expr]seq2c                 C  s2   t | t |ks
J ttdd t| |D S )Nc                 s  s    | ]	\}}|| V  qd S rt   r=   )rB   abr=   r=   rD   r     s    zsympy_dot.<locals>.<genexpr>)rG   ri   expandr   zip)r   r   r=   r=   rD   	sympy_dot  s   r   Iterable[_T]ValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS r=   )idrA   r=   r=   rD   
<dictcomp>      zunique.<locals>.<dictcomp>)valuesr   r=   r=   rD   unique     r   numerUnion[int, sympy.Expr]denomc              	   C  sr   t | tjst |tjrtt| t|S t | tr!t |ts4J |  dt|  d| dt| t| |S )Nz: , )rh   ri   ExprrP   sympifyrb   r   runtime_ceildiv)r   r   r=   r=   rD   rZ     s    
rZ   keyOptional[torch.dtype]c                 C  s   | d u rdS t | dd }i dddddd	d
ddddddd	ddddddddddddddddd d!d"dd#d$d%d&}t| D ]}|||< qPt| t r^| S d'||  S )(Nz*i8.r   rg   i1Z
float8e4nvZfp8e4nvZfloat8e5Zfp8e5Zfloat8e4b15Zfp8e4b15Zfloat8e4b15x4Z
fp8e4b15x4float8_e4m3fnfloat8_e5m2Zfloat8_e8m0fnuu8float16Zfp16bfloat16Zbf16float32Zfp32float64Zfp64int8i8Zint16Zi16int32Zi32int64Zi64uint8u16u32Zu64)Zuint16Zuint32uint64*)r<   splitlistr   rh   )r   Z	dtype_strZtysre   r=   r=   rD   _type_of$  sZ   

r   lst"Iterable[Union[int, torch.SymInt]]list[sympy.Expr]c                 C  s   dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS r=   )ri   r   rB   r   r=   r=   rD   rE   R  r   z-convert_shape_to_inductor.<locals>.<listcomp>r=   r   r=   r=   rD   convert_shape_to_inductorJ  s   r    Iterable[Union[int, sympy.Expr]]list[Union[int, torch.SymInt]]c                   s   ddl m   fdd| D S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    r(   Vc                   sB   g | ]}t |tr|nt |tjrt|n	 jjjj|d dqS )N)hint)rh   rb   ri   ru   graphsizevars	shape_envZcreate_symintnoder   r   r=   rD   rE   ^  s    


z+convert_shape_to_symint.<locals>.<listcomp>)virtualizedr   r   r=   r   rD   convert_shape_to_symintU  s   

r   optorch._ops.OpOverloadc                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s    | ]}|j d uV  qd S rt   )Z
alias_inforB   r   r=   r=   rD   r   p      zis_view.<locals>.<genexpr>)any_schema	argumentsr  r=   r=   rD   is_viewl  s   r	  c                 C     dS NFr=   )r   r=   r=   rD   <lambda>u      r  user'   is_pointwise_fn'Callable[[torch._ops.OpOverload], bool]c                   s~   | j dksdS t| jtjjs| jtju sdS ttjj| j}|tju s(t	|r4t
 fdd| jD S tjj|jv p> |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s    | ]}t | V  qd S rt   )is_pointwise_use)rB   ur  r=   rD   r     r  z#is_pointwise_use.<locals>.<genexpr>)r  rh   targetr?   _ops
OpOverloadr   getitemr   r	  rk   usersTagZ	pointwisetags)r  r  r  r=   r  rD   r  s  s   

r  r  r
   rn   	list[Any]kwargsdict[str, Any]&tuple[GraphModule, list[torch.Tensor]]c                   s   t j  g d
 fdd} j| gtt j|||fR  }t| jjdkr5t	| jjd j
d	kr5|f} | t ji  }|fS )Nargtorch.Tensorr;   r'   c                   s    |   dt S )Nr   )appendplaceholderrG   )r   gZ
graph_argsr=   rD   add_tensor_arg  s   
z)gen_gm_and_inputs.<locals>.add_tensor_argr(   r   Tensor)r   r!  r;   r'   )r?   ZfxZGraphr  r   r'  rG   r  returnsr<   r   outputr%   )r  rn   r  r&  nodegmr=   r$  rD   gen_gm_and_inputs  s   

r,  r8   Nonec                 C  s,   | dkrd S t | }| r|  d S d S Nr   )rJ   r@   r   r   r=   r=   rD   r     s   r   modelCallable[..., Any]example_inputsSequence[Any]timesc                 C  sT   t | td t }t|D ]
}| | }t | qt }|d us&J || S )Ni9  )r   r?   Zmanual_seedtimeperf_counterr   )r/  r1  r3  r   t0r   resultt1r=   r=   rD   timed  s   

r9  r=   
         ?repeatbaselinec                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]	}t  qS r=   )r9  )rB   r   r   r1  r/  r3  r=   rD   rE         z%print_performance.<locals>.<listcomp>z.6f)r?   r   r   Zmedianprintitem)r/  r1  r3  r<  r=  r   ZtimingsZtookr=   r>  rD   print_performance  s   rB  objmethodc                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S rt   r=   r=   r7  r=   rD   r    r  z#precompute_method.<locals>.<lambda>N)r>   setattr)rC  rD  r=   rE  rD   precompute_method  s   rG  methods	list[str]c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)rG  )rC  rH  rD  r=   r=   rD   precompute_methods  s   rJ  r   r   c                 C  s   t | |kt | |k  S rt   )rb   )r   r   r=   r=   rD   cmp     rK  rC   Union[int, Sequence[int]]sizeSequence[int]c                 C  s:   t | tr
| g| S t| dkrt| | d g| S | S )Nr(   r   )rh   rb   rG   r   )rC   rN  r=   r=   rD   pad_listlike  s
   

rP  tuple[_T, ...]list[_T]c                 C  s&   t | dkrg S d	dd}t| |dS )
Nr   elemr\   r;   r<   c                 S  s0   t | tr| S ddlm} t | |sJ |  S )Nr(   )r6   )rh   r<   	schedulerr6   get_name)rS  r6   r=   r=   rD   	sort_func  s
   
ztuple_sorted.<locals>.sort_funcr   )rS  r\   r;   r<   )rG   sorted)rC   rV  r=   r=   rD   tuple_sorted  s   
	rY  PRVT)	covariantc                   @  s$   e Zd ZedddZdddZdS )CachedMethodr   r
   r;   r-  c                 C     d S rt   r=   )r   r=   r=   rD   clear_cache     zCachedMethod.clear_cachern   P.argsr  P.kwargsr[  c                 O  r^  rt   r=   selfrn   r  r=   r=   rD   __call__  r  zCachedMethod.__call__N)r   r
   r;   r-  )rn   ra  r  rb  r;   r[  )rx   ry   rz   staticmethodr_  re  r=   r=   r=   rD   r]    s    r]  !Callable[Concatenate[Any, P], RV]CachedMethod[P, RV]c                   sl   | j }d| d d| i}td| d  d  d | t| || d }d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        Z_cache_on_selfrd  r
   r;   r-  c                   s   t |  rt|   d S d S rt   )r   delattrrd  rW  r=   rD   r_    s   
z"cache_on_self.<locals>.clear_cache)rd  r
   r;   r-  )rx   execlstripr   wrapsr_  )r   r   ctxwrapperr_  r=   rW  rD   cache_on_self  s$   	rr  node_schedule0Union[Sequence[BaseSchedulerNode], ExternKernel]OrderedSet[Node]c                 C  sJ   ddl m} t| trttjdd | D t S t| |j	r"| j
S t S )Nr(   irc                 S  s$   g | ]}t |d r|jr|jjqS )r*  )r   r*  origins)rB   r*  r=   r=   rD   rE   '  s    z%aggregate_origins.<locals>.<listcomp>) rw  rh   r   r   r   r   or_r   r/   rx  )rs  rw  r=   r=   rD   aggregate_origins  s   
	r{  Sequence[BaseSchedulerNode]descriptive_names8Literal[True, 'torch', 'original_aten', 'inductor_node']c                 C  s   t | }|dkrdd |D }tt|}nH|dkrPg }|D ]*}|jdkrHd|jv rH|jd d }t|d tr@||d  q||d j qtt|}n|d	kr\d
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]}|j d krd|jv r|jd dur|jd jjqS )r  r  N)r  r   _overloadpacketrx   rB   originr=   r=   rD   rE   ;  s    

z)get_fused_kernel_name.<locals>.<listcomp>r?   r  Zsource_fn_stackr   r(   Zinductor_nodec                 S  s   g | ]
}|j d kr|jqS r  )r  r   r  r=   r=   rD   rE   O  s    r   Zfused)r{  rX  r   r  r   rh   r<   r"  rx   NotImplementedErrorjoin)rs  r}  all_originssourcesr  Z	source_fnr=   r=   rD   get_fused_kernel_name4  s.   r  rq  r+   tuple[str, str]c                   s  t | }dd |D }tt}tt}d  t|rQtdd |D }t|dkrQ|d j t dsGi }t j	D ]\}}	|||	< q;| _
|j fdd	d
 |D ]3}
d|
jv rq|
jd d urqt|
jd j}|| |
j d|
jv r|
jd d j}|| |
j qS d urdnd}|j d| dd|  dd|  d}|j dg}t| D ]\}}||j d| ddt|  q d ur||j d |D ]}	||j d|	   q|d|fS )Nc                 S  s   g | ]	}|j d kr|qS r  r  r  r=   r=   rD   rE   ]  r?  z'get_kernel_metadata.<locals>.<listcomp>c                 s  r   rt   )r   )rB   nr=   r=   rD   r   g  r   z&get_kernel_metadata.<locals>.<genexpr>r(   r   )_inductor_kernel_metadata_node_to_idx_mapc                   s
    j |  S rt   )r  r  Zsingle_graphr=   rD   r  q  s   
 z%get_kernel_metadata.<locals>.<lambda>rW  r  Z	from_nodezTopologically SortedZUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r{  collectionsdefaultdictr   rG   r   r   r   r   nodesr  sortr   r<   r  r"  r   commentr  keysrX  itemsZformat_node)rs  rq  r  Zinductor_nodesZfrom_node_dictZoriginal_aten_dictZunique_graphsZnode_to_idx_mapidxr  r*  r   Zsort_strmetadataZdetailed_metadataZoriginal_noder  r=   r  rD   get_kernel_metadataX  sP   






r  initial_queueIterable[torch.fx.Node]skip_filterOptional[Callable[[Any], bool]]OrderedSet[torch.fx.Node]c                 C  sZ   t | } t| }| r+|  }|jD ]}|r||rq||vr(|| | | q| s
|S )zJReturns the set of nodes whose values depend on those within initial_queue)r   r   rH   r  addr"  )r  r  Zdominated_setr*  userr=   r=   rD   dominated_nodes  s   


	r  Sequence[IRNode]dict[str, IRNode]OrderedSet[IRNode]c                   sd   dd l }ddlm  d fdd	fd
d| D }fdd| D }t|jg ||R  S )Nr   r(   rv  r  r0   r;   rg   c                   sD   t |  jr| jS t |  jr| jS t |  jo!t |  jS rt   )rh   	TensorBoxdata
StorageBoxr0   Z	Pointwiser  rw  is_unrealized_noder=   rD   r    s
   

z*gather_origins.<locals>.is_unrealized_nodec                      g | ]	} |r|j qS r=   rx  )rB   valr  r=   rD   rE     r?  z"gather_origins.<locals>.<listcomp>c                   r  r=   r  )rB   r   r  r=   rD   rE     r?  )r  r0   r;   rg   )	itertoolsry  rw  r   r   chain)rn   r  r  Zkwarg_originsZarg_originsr=   r  rD   gather_origins  s   r  exprc                 C  s   t | tjr	| jS t | tjrdtt| jS t | tj	r'dtt| jS t | t
tttfrA| jj ddtt| j dS t| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    z + z * (r   ))rh   ri   Symbolr   rj   r  rl   	sympy_strrn   ZMulrT   rQ   rR   rS   funcrx   r<   r  r=   r=   rD   r    s   "r  r   ValueRanges[Any]c                 C  s>   ddl m} tjrt|jdd  }r|jdkrt| S t	 S )Nr(   r   Zcurrent_nodeZ
index_expr)
r   r   rY   Zcompute_all_boundsr>   interpreterr  rW   rX   unknown)r   r   Zfx_noder=   r=   rD   get_bounds_index_expr  s   
r  prefixc                 C  s   | d dkS )Nr   rr=   )r  r=   r=   rD   prefix_is_reduction     r  rV   r  sympy.Symbolc                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    TintegerZnonnegative)rV   ZSIZErU   )r  r  r=   r=   rD   sympy_index_symbol_with_prefix  s   r  checkc                 C  s   | st jot jS rt   )rY   Zdebug_index_assertsZassert_indirect_indexing)r  r=   r=   rD   generate_assert     r  r   c                 C  s    | d dksJ t j| dddS )r  r   sTr  )ri   r  r   r=   r=   rD   sympy_index_symbol  s   r  replacementsdict[sympy.Expr, Any]c                   s,   ddd t |  fd	d
| D S )z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    replacedrf   replacementUnion[sympy.Expr, str]r;   r  c                 S  s2   t | tjsJ t |trtj|| j| jdS |S )Nr  )rh   ri   r   r<   r  r}   Zis_nonnegative)r  r  r=   r=   rD   	to_symbol   s   
zsympy_subs.<locals>.to_symbolc                   s   i | ]
\}}| ||qS r=   r=   rB   kre   r  r=   rD   r         zsympy_subs.<locals>.<dictcomp>N)r  rf   r  r  r;   r  )ri   r   Zxreplacer  )r  r  r=   r  rD   
sympy_subs  s   

r  ,TypeGuard[Union[torch.SymInt, torch.Tensor]]c                 C  s:   t | tjpt | tjotdd t|  |  D S )Nc                 s      | ]}t |V  qd S rt   is_symbolicrA   r=   r=   rD   r         zis_symbolic.<locals>.<genexpr>)	rh   r?   r#   r'  r  r  r  rN  stride)r   r=   r=   rD   r    s    r  c                  G     t dd | D S )Nc                 s  r  rt   r  r  r=   r=   rD   r     r  z"any_is_symbolic.<locals>.<genexpr>r  )rn   r=   r=   rD   any_is_symbolic  r   r  r+  torch.fx.GraphModuleOptional[torch.fx.Node]c                 C  sv   ddl m} tg d}t r|d | jjD ]}t|j	|v r&|  S |j
d }d ur8||r8|  S qd S )Nr   )free_unbacked_symbols)z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultZrun_and_save_rng_stateZrun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesr  r   r?   $are_deterministic_algorithms_enabledupdater   r  r<   r  r   get)r+  r  Zforbidden_setr*  r  r=   r=   rD   %get_first_incompatible_cudagraph_node  s   r  c                 C  s&   t tt| jj}|jdksJ |S )z$Get the output node from an FX graphr)  )nextiterreversedr   r  r  )r+  Z	last_noder=   r=   rD   output_nodeM  s   r  _registered_cachesc                 C  s0   t | dr
t| jst|  dt|  | S )zq
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_inductor_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r"  rC  r=   r=   rD   clear_on_fresh_inductor_cacheW  s   
r  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r  r  r  r=   r=   rD   clear_inductor_cachesc  s   
r  c                  C  s   t tj D ]9} | dsqtj|  }|j D ]"}|dr;t||}t|tj	j
jjr;|jD ]	}|jjj  q1qtj| = qdtjv rVtjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.Ztriton_ztriton.runtime.driver)r   sysmodulesr  
startswith__dict__r>   rh   r?   Z	_inductorZruntimeZtriton_heuristicsZCachingAutotunerZcompile_resultskernelrunmod__del__r   driveractiveutilsinstancegcZcollect)module_namemZ	attr_namer  r7  r  r=   r=   rD   unload_xpu_triton_pydsn  s&   







r  cache_entriesOptional[dict[str, Any]]dirOptional[str]deleteIterator[None]c              	   #  sP   t   tj|d zztjtjd iX t	d  tj
 dtjtjdi1 dV  t| trXt| dksAJ dtj
rXt}| fd	d
|D  W d   n1 sbw   Y  W d   n1 sqw   Y  |rt rtj rt  tj  fddd W n ty   td   w W t   dS t   w )z
    Contextmanager that provides a clean tmp cachedir for inductor.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )r  ZTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonZTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]}d |vr|t jt j |qS )z.lock)ospathgetsizer  )rB   f)triton_cache_dirr=   rD   r     s
    z(fresh_inductor_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  r  r  )inductor_cache_dirr=   rD   r    s
    z&fresh_inductor_cache.<locals>.<lambda>)onerrorz(on error, temporary cache dir kept at %s)r  tempfilemkdtempr   patchdictr
  environr   r   r  r  rh   rG   existslistdirr  
is_windowsr?   r9   r@   r  shutilrmtree	Exceptionr  )r  r  r  filesr=   )r  r  rD   fresh_inductor_cache  sL   




r  seq	list[int]c                 C  s(   | j }tt| }ttt||ddS )NT)r   reverse)__getitem__r   rG   r   r  rX  )r   getterZa_rr=   r=   rD   argsort  s   r%  r   r&   .Sequence[Union[int, torch.SymInt, sympy.Expr]]c                   sD   d fdd}dd	 t |D }t|t|d
}dd	 |D }|S )Nr   tuple[int, sympy.Expr]r   r;   rb   c                   sZ   | \}}|\}}d
 fdd}|||k rdS |||krdS ||k r%dS ||kr+dS d	S )Nr  %Union[bool, torch.SymInt, sympy.Expr]r;   rg   c                   s   t | tr| S  j| ddS )NT)Zsize_oblivious)rh   rg   Zevaluate_exprr  r   r=   rD   evaluate  s   
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r(   r   )r  r(  r;   rg   r=   )r   r   Za_idxZa_valZb_idxZb_valr*  r)  r=   rD   rK    s   zargsort_sym.<locals>.cmpc                 S  s,   g | ]\}}|t |tjr|jjn|fqS r=   )rh   r?   r#   r*  r  )rB   r  r  r=   r=   rD   rE     s    zargsort_sym.<locals>.<listcomp>rW  c                 S  s   g | ]\}}|qS r=   r=   )rB   r  r   r=   r=   rD   rE         )r   r'  r   r'  r;   rb   )r   rX  r   
cmp_to_key)r   r   rK  exprsr7  r=   r)  rD   argsort_sym  s   r.  r   torch.dtypec                 C  s    | t jkrdS t jd| d S )Nr`   r=   r   )r?   r   r   Zelement_sizer0  r=   r=   rD   get_dtype_size  s   
r1  c                   @  s   e Zd ZU ded< dS )LineContextr
   contextNrx   ry   rz   __annotations__r=   r=   r=   rD   r2    s   
 r2  c                   @     e Zd ZU ded< ded< dS )ValueWithLineMapr<   rr   zlist[tuple[int, LineContext]]Zline_mapNr4  r=   r=   r=   rD   r7       
 r7  c                   @  s   e Zd ZdZd<d=ddZd>d
dZd?ddZd?ddZd@ddZdAddZ	d?ddZ
d@ddZdBddZdCd d!ZdDdEd%d&ZdDdFd'd(ZdDdFd)d*Z	+dGdHd/d0ZdId3d4Zd?d5d6ZdJd9d:Zd;S )KIndentedBuffer   r   initial_indentrb   r;   r-  c                 C  s   g | _ || _d S rt   )_lines_indent)rd  r;  r=   r=   rD   __init__      
zIndentedBuffer.__init__r7  c                 C  s   t  }d}g }| jD ]:}t|tr| }|d u rq
nt|tr(|||jf q
|}t|ts1J || |d |d|	d 7 }q
t
| |S )Nr(   r  )r	   r<  rh   DeferredLineBaser2  r"  r3  r<   writecountr7  getvalue)rd  bufr   Zlinemapliliner=   r=   rD   getvaluewithlinemap  s$   




z"IndentedBuffer.getvaluewithlinemapr<   c                 C  s
   |   jS rt   )rG  rr   rl  r=   r=   rD   rC       
zIndentedBuffer.getvaluec                 C  s   t  }| jD ]8}t|tr| }|d u rqnt|trq|}t|ts%J |dr4||d d  q|| |d q| S )N\r   r  )	r	   r<  rh   r@  r2  r<   endswithrA  rC  )rd  rD  rE  rF  r=   r=   rD   getrawvalue  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S rt   )r<  clearrl  r=   r=   rD   rL  /     zIndentedBuffer.clearrg   c                 C  
   t | jS rt   )rg   r<  rl  r=   r=   rD   __bool__2  rH  zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )r=  tabwidthrl  r=   r=   rD   r  5  r  zIndentedBuffer.prefixc                 C  s   |  d d S )Nr  	writelinerl  r=   r=   rD   newline8  rM  zIndentedBuffer.newlinerF  )Union[LineContext, DeferredLineBase, str]c                 C  sr   t |tr| j| d S t |tr| j||   d S | r1| j|   |  d S | jd d S Nry  )rh   r2  r<  r"  r@  with_prefixr  striprd  rF  r=   r=   rD   rR  ;  s   

zIndentedBuffer.writelinelines3Sequence[Union[LineContext, DeferredLineBase, str]]c                 C  s   |D ]}|  | qd S rt   rQ  )rd  rY  rF  r=   r=   rD   
writelinesE  s   zIndentedBuffer.writelinesr(   offset'contextlib.AbstractContextManager[None]c                   s   t jd fdd}| S )Nr;   r  c                	   3  s<     j  7  _ zd V  W  j  8  _ d S  j  8  _ w rt   r=  r=   r\  rd  r=   rD   rp  L  
   "z"IndentedBuffer.indent.<locals>.ctxr;   r  )
contextlibcontextmanager)rd  r\  rp  r=   r_  rD   indentK  s   zIndentedBuffer.indentc                 C  s   |  j |7  _ d S rt   r^  rd  r\  r=   r=   rD   	do_indentV  r   zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S rt   r^  re  r=   r=   rD   do_unindentY  r   zIndentedBuffer.do_unindentF
other_codeUnion[IndentedBuffer, str]rW  c                 C  s   t |trJtd}|jD ]}t |ts"|r"t|t|t|  }qt	|r*d}|jD ]}t |tr;| j
| q-t| |t|d   q-d S t|}|rU| }|sYd S | }|dD ]}| | qbd S )Ninfr   r  )rh   r9  r   r<  r2  minrG   rn  mathisinfr"  rR  rb   textwrapdedentrstripr   )rd  rh  rW  ro  rF  r  r=   r=   rD   splice\  s,   





zIndentedBuffer.splicer  Callable[[Any], Any]c                   s&   t | jd} fdd| jD |_|S )Nr;  c                   s   g | ]} |qS r=   r=   )rB   rF  r  r=   rD   rE   w  r+  z&IndentedBuffer.map.<locals>.<listcomp>)r9  r=  r<  )rd  r  r   r=   rt  rD   rl   u  s   zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr  r  )r   rC  rl  r=   r=   rD   __repr__z  rL  zIndentedBuffer.__repr__otherr   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nrs  )r=  r9  r[  r<  )rd  rv  r   r=   r=   rD   __add__}  s
   zIndentedBuffer.__add__Nr   )r;  rb   r;   r-  )r;   r7  r;   r<   r;   r-  r;   rg   )rF  rT  r;   r-  )rY  rZ  r;   r-  rq   )r\  rb   r;   r]  )r\  rb   r;   r-  )F)rh  ri  rW  rg   r;   r-  )r  rr  r;   r9  )rv  r   r;   r9  )rx   ry   rz   rP  r>  rG  rC  rK  rL  rO  r  rS  rR  r[  rd  rf  rg  rq  rl   ru  rw  r=   r=   r=   rD   r9    s(    











r9  c                      s(   e Zd Zd
 fddZddd	Z  ZS )FakeIndentedBufferr;   r-  c                   s   t    d S rt   )superr>  rl  	__class__r=   rD   r>    rM  zFakeIndentedBuffer.__init__r   r<   r
   c                 C  s$   |dkr
t | |S td| d)Nr~  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )rd  r   r=   r=   rD   r    s
   
z#FakeIndentedBuffer.__getattribute__ry  )r   r<   r;   r
   )rx   ry   rz   r>  r  __classcell__r=   r=   r}  rD   r{    s    r{  c               	   c  s<    t jt j} }zd V  W | |t _t _d S | |t _t _w rt   )r  stdoutstderr)Zinitial_stdoutZinitial_stderrr=   r=   rD   restore_stdout_stderr  r`  r  c                   @  s`   e Zd ZdZdddZddd	ZdddZd ddZd!ddZd"ddZ	d#ddZ
d$ddZdS )%r@  z.A line that can be 'unwritten' at a later timerF  r<   c                 C  s   |  sd}|| _d S rU  )rW  rF  rX  r=   r=   rD   r>    s   
zDeferredLineBase.__init__r;   Union[str, None]c                 C     t )zJReturns either self.line or None to indicate the line has been 'unwritten'r  rl  r=   r=   rD   re       zDeferredLineBase.__call__r   c                 C  r  )z3Returns a new deferred line with the same conditionr  rX  r=   r=   rD   	_new_line  r  zDeferredLineBase._new_liner  c                 C  s   |  | | j S rt   r  rF  )rd  r  r=   r=   rD   rV    r   zDeferredLineBase.with_prefixc                 C  s   |  | j S rt   )r  rF  rn  rl  r=   r=   rD   rn    r  zDeferredLineBase.lstripr   Union[int, slice]c                 C  s   |  | j| S rt   r  )rd  r   r=   r=   rD   r#    r  zDeferredLineBase.__getitem__rg   c                 C  rN  rt   )rg   rF  rl  r=   r=   rD   rO    rH  zDeferredLineBase.__bool__rb   c                 C  rN  rt   )rG   rF  rl  r=   r=   rD   __len__  rH  zDeferredLineBase.__len__N)rF  r<   )r;   r  )rF  r<   r;   r   )r  r<   r;   r   )r;   r   )r   r  r;   r   rz  r;   rb   )rx   ry   rz   r{   r>  re  r  rV  rn  r#  rO  r  r=   r=   r=   rD   r@    s    






r@  c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r   r<   value_fnCallable[[], str]rF  c                   s   t  | || _|| _d S rt   )r|  r>  r   r  )rd  r   r  rF  r}  r=   rD   r>    s   
zDelayReplaceLine.__init__r;   c                 C  s   | j | j|  S rt   )rF  replacer   r  rl  r=   r=   rD   re    r   zDelayReplaceLine.__call__c                 C  s   t | j| j|S rt   )r  r   r  rX  r=   r=   rD   r    r  zDelayReplaceLine._new_line)r   r<   r  r  rF  r<   rx  )rF  r<   r;   r  )rx   ry   rz   r{   r>  re  r  r  r=   r=   r}  rD   r    s
    
r  index_or_deviceUnion[int, torch.device]c                 C  s   t | tjr	| }ntt | }t|}tjjr3|jd us J |jdk s*|jdkr1t	
d dS dS |jdkr:dnd}|j}||k rOt	j
d	||d
d dS dS )N	   r:  z6GPU arch does not support max_autotune_gemm mode usageFTr9   r]   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rh   r?   r   rI   r   createversionhipmajorr   r  r   multi_processor_count)r  r   propr  r  r=   r=   rD   
is_big_gpu  s&   

r  c                   C  s   t jdjS )Nr8   )r?   r8   get_device_propertiesr  r=   r=   r=   rD   get_max_num_sms     r  c                  C  s"   t j } t | dur|  S d S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )r?   r   Z_get_sm_carveout_experimentalr  )Zcarveoutr=   r=   rD   get_num_sms  s   
r  num_tma_descriptorsr)   c                 C  s<   ddl m}m} |d}t |  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r(   )r)   WorkspaceZeroModeF)rB  	zero_moder   Z
outer_name)codegen.commonr)   r  Z	from_boolr  TMA_DESCRIPTOR_SIZEZunique_name)r  r   r)   r  r  rN  r=   r=   rD   get_tma_workspace_arg  s   
r  c                   C  s   t jpt jpt jS rt   )rY   Zmax_autotuneZmax_autotune_gemmZsearch_autotune_cacher=   r=   r=   rD   use_max_autotune  s   r  layoutr1   allowed_layout_dtypeslist[torch.dtype]c                 C  s    t | jjo| j|v ot| jS rt   )is_gpur   r   r   r  )r  r  r=   r=   rD   _use_template_for_gpu  s
   r  backendc                 C  "   |   dd tj  dD v S )Nc                 S     g | ]}|  qS r=   rW  rA   r=   r=   rD   rE         z)_use_autotune_backend.<locals>.<listcomp>,)upperrY   Zmax_autotune_gemm_backendsr   r  r=   r=   rD   _use_autotune_backend     r  c                 C  r  )Nc                 S  r  r=   r  rA   r=   r=   rD   rE     r  z._use_conv_autotune_backend.<locals>.<listcomp>r  )r  rY   Zmax_autotune_conv_backendsr   r  r=   r=   rD   _use_conv_autotune_backend  r  r  F)enable_int32enable_float8r  r  c                C  s   ddl m}m} tjtjtjg}|rtjtjtjtjg}|r'|tj	tj
g t| jjo1t| |p<| jjdko<| j|v oJt oJtdoJ|| j|jS )Nr(   )BackendFeaturehas_backend_featurer   ZTRITON)r  r  r  r?   r   r   r   r   extendr   r   r  r   r   r  r   r  r  ZTRITON_TEMPLATES)r  r  r  r  r  layout_dtypesr=   r=   rD   use_triton_template#  s"   	r  matricesr0   c                    sJ   ddl m} ddlm  d fd	d
tjjo$| o$tfdd| D S )Nr   )has_triton_tma_devicer(   r   rC   r0   r;   rg   c                   s   t |  dkr
dS |  }|tjtjfvrdS |  }| }| s(|s(dS |j	d }|r4|j	d }||j
 } jj|tS )N   Fr(   r   )rG   get_size	get_dtyper?   r   r   Z
get_layoutZis_transposedis_contiguousrN  itemsizer   r   Zstatically_known_multiple_ofTMA_ALIGNMENT)rC   r   r  Z
transposedZ	inner_dimZinner_bytesr   r=   rD   _is_tma_compatible@  s   


z3use_triton_tma_template.<locals>._is_tma_compatiblec                 3      | ]} |V  qd S rt   r=   )rB   r  )r  r=   rD   r   V  r  z*use_triton_tma_template.<locals>.<genexpr>rC   r0   r;   rg   )Ztorch.utils._tritonr  r   r   rY   r	  Zenable_persistent_tma_matmulrk   )r  r  r=   )r   r  rD   use_triton_tma_template;  s   r  r  r  r  c           	      C  s   ddl m} |jjj|| | dd}|dks|tjjk rdS ddlm	} t
jjr+dS t
jt
jt
jt
jg}t| |o@t o@td}|rM| sMtd	 dS |S )
Nr(   r   r   fallbackr   F)try_import_cutlassZCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r   r   r   r   	size_hintrY   r8   Zcutlass_backend_min_gemm_sizeZcodegen.cuda.cutlass_utilsr  r?   r  r  r   r   r   r   r  r  r  r   r  )	r  r  r  r  r   Z	gemm_sizer  r  r   r=   r=   rD   use_cutlass_templateZ  s(   
r  c                 C  s   t j| jS rt   )r?   r8   r  ZgcnArchNamer   r=   r=   rD   _rocm_native_device_arch_namex  r  r  Qtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s|   zdd l } ddlm}m} ddlm} tj| j	}W n t
y7   ddd}ddd	}G d
d d}d }Y nw ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationr;   r  c                   S     g S rt   r=   r=   r=   r=   rD   r    r`  z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  r  rt   r=   r=   r=   r=   rD   r    r`  z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)rx   ry   rz   r=   r=   r=   rD   r    s    r  )r;   r  )ck4inductorZ(ck4inductor.universal_gemm.gen_instancesr  r  Zck4inductor.universal_gemm.opr  r
  r  dirname__file__r   )r  r  r  r  Zpackage_dirnamer=   r=   rD   try_import_ck_lib}  s   

r  c                   s   t  sdS tjjsdS | jjdksdS t| j}dd tjj	D p)|
dd |i  fdd  tjj@ D }|s=dS | jtjtjtjfvrJdS t \}}}}|sZtd	 dS t rb|tj_tjjsmtd
 dS |tjjkrztd dS dS )NFr8   c                 S  s   i | ]
}| d d |qS ):r   )r   rB   r  r=   r=   rD   r     r  z#use_ck_template.<locals>.<dictcomp>r  r   c                   s   g | ]} | qS r=   r=   r  Zrequested_archsr=   rD   rE     s    z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r  r?   r  r  r   r   r  rY   Zrocmarchr   r  Zck_supported_archr   r   r   r   r  r   r  	is_fbcodeZck_dir)r  Znative_archZrequested_supported_archsZck_package_dirnamer   r=   r  rD   use_ck_template  s<   




r  c                 C  s:   ddl m} tdot| o|jjj|| | dddkS )Nr(   r   CKr   r  r   )r   r   r  r  r   r   r  )r  r  r  r  r   r=   r=   rD   use_ck_gemm_template  s   r  c                 C  s   t dot| S )Nr  )r  r  r  r=   r=   rD   use_ck_conv_template  r  r  c                 C  s   t  o| jjdkS r.  )r  r   r   r  r=   r=   rD   _use_template_for_cpu  r   r  mat1Union[ReinterpretView, Buffer]mat2c                 C  s6   ddl m} t|j|sJ t| ||ddo|j S )Nr(   )r1   F)require_constant_mat2)rw  r1   rh   r  use_cpp_gemm_templater  )r  r  r  r1   r=   r=   rD   use_cpp_bmm_template  s
   r  mat2_transposedr  is_woq_int4q_group_sizeOptional[int]c                 C  s:  ddl m} ddlm} ddlm}	 ddlm}
 t| r t	ds"dS t
jjs(dS | tjtjfv }tjtjtjtjg}|
|||rD| jnd ||d\}}}} }}t||frXdS t||jrb| }|	| \}}|d	|||| | |t | |d

}ddd}| j|v o|d uo||ot||jo| p| S )Nr(   rv  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsZCPPF)	out_dtyper  Zuse_4x2_dim
micro_gemm)Zinput_dtypeZinput2_dtypeoutput_dtypeZnum_threadsZuse_refr  rC   r0   r;   rg   c                 S  s   |    |  d dkS )Nr   r(   )Zfreeze_layoutZ
get_striderC   r=   r=   rD   is_last_dim_stride1  s   z2use_cpp_gemm_template.<locals>.is_last_dim_stride1r  )ry  rw  Zcodegen.cpp_micro_gemmr  Zcodegen.cpp_utilsr  Zkernel.mm_commonr  r  r  rY   cppZweight_prepackr  r?   r   r   r   r   Zhalfr   has_free_symbolsrh   BaseViewZunwrap_viewparallel_num_threadsr  Zis_module_buffer)r  r  r  r  r  r  r  rw  r  r  r  Z	int8_gemmr  r  r  r  r  r   r  r  r=   r=   rD   r    sX   		


r  c                   C  s   t   ptdS )NZATEN)r  r  r=   r=   r=   rD   use_aten_gemm_kernels'  r  r  c                   @  s>   e Zd ZU edZded< dddZddd	ZdddZ	dS )DebugDirManagerr   r<   prev_debug_namer;   r-  c                 C  s   t tj| _d S rt   )r  r   counterr   rl  r=   r=   rD   r>  /  r  zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )NZ_tmp_)r?   _dynamorY   debug_dir_rootr  r   new_namerl  r=   r=   rD   	__enter__2  s   zDebugDirManager.__enter__rn   r
   c                 G  s   t | j | jtjj_d S rt   )r  r  r  r  r?   r  rY   r  )rd  rn   r=   r=   rD   __exit__7  s   zDebugDirManager.__exit__Nry  )rn   r
   r;   r-  )
rx   ry   rz   r  rB  r  r5  r>  r  r  r=   r=   r=   rD   r   +  s   
 


r   Callable[P, _T]ra  rb  tuple[_T, list[str]]c                   st   ddl m} g  d
 fdd}tj|d	| tj  | |i |}W d    | fS 1 s1w   Y  | fS )Nr(   r,   coder<   r;   r-  c                        |  d S rt   r"  r
  source_codesr=   rD   save_output_codeE  rM  z*run_and_get_code.<locals>.save_output_coder  r
  r<   r;   r-  r   r-   r   r  r  r?   r  reset)r   rn   r  r-   r  r7  r=   r  rD   run_and_get_code<  s   

r  tuple[Any, list[str]]c                 O  sF   t | g|R i |\}}g }|D ]}|td|tj q||fS )Nz	'''.*?''')r  r  refindallDOTALL)r   rn   r  r7  r  Zkernelsr
  r=   r=   rD   run_and_get_kernelsN  s
   r  c                   s   d fdd}t |S )Nr;   r
   c                    s     } |     | S rt   )r   ZbackwardrE  r   r=   rD   run_with_backwardY  s   z1run_fw_bw_and_get_code.<locals>.run_with_backward)r;   r
   )r  )r   r  r=   r  rD   run_fw_bw_and_get_codeX  s   r  c              	     s   ddl m} g dfdd d fdd}tj|d|5 tj|d  tj  | |i |}W d   n1 s>w   Y  W d   S W d   S 1 sVw   Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r(   r,   r
  r<   r;   r-  c                   r  rt   r  r  r  r=   rD   r  g  rM  z"get_code.<locals>.save_output_coderd  r-   r
   c                   sF   G dd d}| j r|  n|  \}} |j |r  |j | S )Nc                   @  s$   e Zd ZdZdddZdd	d
ZdS )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton moduler;   r-  c                 S  r^  rt   r=   rl  r=   r=   rD   r>  n  r`  zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__rn   r
   r  c                 _  r^  rt   r=   rc  r=   r=   rD   callq  r  zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callNry  rn   r
   r  r
   r;   r-  )rx   ry   rz   r{   r>  r  r=   r=   r=   rD   DummyModulek  s    
r  )Zcpp_wrapperZcodegen_with_cpp_wrapperZcodegenrr   )rd  r  Zwrapper_codekernel_code)r  r=   rD   patched_compile_to_modulej  s   

z+get_code.<locals>.patched_compile_to_moduleZcompile_to_moduler  Nr  )rd  r-   r;   r
   r  )r   rn   r  r-   r!  r   r=   )r  r  rD   get_codea  s$   
(


r"  c                 O  sJ   t | g|R i |}dt|  krdks!n J dt| |d S Nr(   r  z%expected one or two code outputs got r   )r"  rG   )r   rn   r  r  r=   r=   rD   get_triton_code  s
   r$  c                 O  sN   t | g|R i |\}}dt|  krdks#n J dt| |d S r#  )r  rG   )r   rn   r  r   r  r=   r=   rD   run_and_get_triton_code  s
   r%  tuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg d fd	d
}tj|d| | |i |}W d    |fS 1 s7w   Y  |fS )Nr   r,   r4   rn   r
   r  r;   r-  c                    s2   | i | | d }t | sJ | d S )Nr  )rh   r"  )rn   r  r   r-   Zgraph_loweringsZ	real_initr=   rD   	fake_init  s   z-run_and_get_graph_lowering.<locals>.fake_initr>  r  )Ztorch._inductor.graphr-   Ztorch._inductor.output_coder5   r>  r   r  r  )r   rn   r  r5   r(  r7  r=   r'  rD   run_and_get_graph_lowering  s   
r)  aten_opoverride_fnc              	   c  sN    ddl m} |j|  }zt|||j| < dV  W ||j| < dS ||j| < w )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)Ztorch._inductorr,  Z	loweringsr   partial)r*  r+  r,  orig_fnr=   r=   rD   override_lowering  s   
r/  pre_fnpost_fnOptional[Callable[..., Any]]c                   s6   ddl m} |j d fdd}tjj|d	|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	SchedulerrT  r
   r  r;   c                   s&   | |  | |}r| | |S rt   r=   )rT  r  outr.  r1  r0  r=   rD   rq    s
   


z(add_scheduler_init_hook.<locals>.wrapperr>  N)rT  r
   r  r
   r;   r
   )torch._inductor.schedulerr3  r>  unittestr   r  r  )r0  r1  r3  rq  r=   r5  rD   add_scheduler_init_hook  s   r8  msgc                 C  s"   t jr
t|  dS t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)rY   Zdeveloper_warningsr   r  info)r9  r=   r=   rD   developer_warning  s   r;  c                  C  s   z/t jd} | d tt jk r.tt j| d  dkr.t j| d  d dkr.t j| d  W S W n	 ty8   Y nw t jD ]}|drM|tdd   S q<dS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr(   r   -z--only=N)r  argvr   rG   
ValueErrorr  )r  r   r=   r=   rD   get_benchmark_name  s   

r?  r  c                 C  r  )Nc                 s      | ]}|d kV  qdS r(   Nr=   rA   r=   r=   rD   r   	  r  zis_ones.<locals>.<genexpr>rk   r  r=   r=   rD   is_ones  r   rD  c                 C  r  )Nc                 s  r@  )r   Nr=   rA   r=   r=   rD   r     r  zis_zeros.<locals>.<genexpr>rB  rC  r=   r=   rD   is_zeros  r   rE  inputsSequence[torch.Tensor]c                 C  r  )Nc                 s  s,    | ]}t |tjr|jtd kV  qdS )r   N)rh   r?   r'  r   )rB   rA  r=   r=   rD   r     s    

z is_cpu_device.<locals>.<genexpr>rB  )rF  r=   r=   rD   is_cpu_device  s   rH  r  c                 C  s&   t | tjs
J d| jrtjS tjS )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rh   ri   r   r}   r?   r   r   )r  r=   r=   rD   get_sympy_Expr_dtype  s   rI  should_profileIterator[Any]c                 o  sN    | r"t jj|i |}|V  W d    d S 1 sw   Y  d S d V  d S rt   )r?   r   r   )rJ  rn   r  r   r=   r=   rD   maybe_profile"  s   "
rL  c                  C  s   t jj} | dk rt } | S Nr(   )rY   r  threadsr?   Zget_num_threads)rN  r=   r=   rD   r  +  s   r  c                  C  s,   ddl m}  |  }|dtjjrdS dS )Nr(   )get_backend_optionsZ
num_stagesr     )Zruntime.triton_helpersrO  r  r?   r  r  )rO  optionsr=   r=   rD   get_backend_num_stages2  s   rR  c                 C  s   ddl m}m} | tjtjtjfv sJ t|j	
drEddlm} | }| tjtjfv r3|| |S tjjjjr?|tj|S |tj|S | tjtjfv rQ|| S tjjjjr\|tjS |tjS )Nr   )get_max_simd_tflopsget_max_tensorcore_tflopsZ
clock_rate)max_clock_rate)triton.testingrS  rT  r?   r   r   r   inspect	signature
parametersr  Ztorch._utils_internalrU  backendsr8   matmulZ
allow_tf32)r   rS  rT  rU  Zsm_clockr=   r=   rD   get_device_tflops:  s   


r\  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)rV  r^  r]  r=   r=   rD   get_gpu_dram_gbpsV  s   r_  c                  C  s"   ddl m}  | jjdddS )Nr   r  Zmax_shared_mem)Ztriton.runtimer  r  r  r  r  r`  r=   r=   rD   get_gpu_shared_memory]  s   ra  reduction_typec                 C  s
   |  dS )NZwelford)r  rb  r=   r=   rD   is_welford_reductionc  rH  rd  c                 C  s   t | rdS | dkrdS dS )NrP  Zonline_softmax_reducer  r(   )rd  rc  r=   r=   rD   reduction_num_outputsg  s
   re  c                   C  s   t  dkS )NLinux)platformsystemr=   r=   r=   rD   is_linuxp  r  ri  c                   C  s
   t jdkS )Nr[   )r  rg  r=   r=   r=   rD   r  t  rH  r  itrIterable[Any]c                 C  r  )Nc                 s  s$    | ]}t |tjo|j V  qd S rt   )rh   ri   r   Z	is_numberrA   r=   r=   rD   r   y  s   " z#has_free_symbols.<locals>.<genexpr>r  )rj  r=   r=   rD   r  x  r   r  c                  G  s~   ddl m} | D ]4}t||j|j|j|j|jfr-t|	 pds)t|
 p'dr, dS qt||js4qtdt| dS )Nr(   rv  r=   Tzunexpected type for is_dynamic F)ry  rw  rh   r  r  r  ZComputedBufferr.   r  Zmaybe_get_sizeZmaybe_get_strider0   	TypeErrorr   )rn   rw  tr=   r=   rD   
is_dynamic|  s   
rn  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)rx   ry   rz   rp  rq  r=   r=   r=   rD   ro    s    ro  r  r%   inpc              	   C  s4  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 sLw   Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    d S 1 sw   Y  d S )Nr(   )stable_topological_sortwzutf-8F)modeencodingr  )r+  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)Zpattern_matcherrs  r  NamedTemporaryFileior	   rO   rK   	propagater@  r   r   nowrN   ZlintZ	recompilerC  r   r:  r   )r  r+  rr  r9  rs  r  Z	before_ioZafter_io
start_timeZtime_elapsedrm  r=   r=   rD   pass_execution_and_save  s>   

"r~  	input_buf"Optional[Union[Buffer, Operation]]c                 C  s&   ddl m} t| |jot| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r(   rv  )ry  rw  rh   ZCppTemplateBufferr  ZMultiOutputLayoutr  rw  r=   r=   rD   is_multi_outputs_template  s   r  c                 C  s4   ddl m} t| |jot| jdkot| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r(   rv  r   )ry  rw  rh   ZMultiOutputrG   rF  r  r  r=   r=   rD   #is_output_of_multi_outputs_template  s   r  r*   Optional[Union[Node, Operation]]!Optional[torch._ops.OperatorBase]c                 C  s   | d u rdS ddl m} t| |jko|d u p| j|u pRt| |jkoRttjj	do2| jtjj	j
jkpRttjj	doB| jtjj	jjkpRttjj	doR| jtjj	jjkS )NFr(   rv  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)ry  rw  r   Z_CollectiveKernelop_overloadFallbackKernelr   r?   r   Ztorchrecr  defaultr  r  r*  r  rw  r=   r=   rD   is_collective  s"   

r  "Optional[Union[IRNode, Operation]]c                 C  s   ddl m} t| |jkS Nr(   rv  )ry  rw  r   Z_WaitKernel)r*  rw  r=   r=   rD   is_wait  s   r  snoder6   c                 C  4   ddl m} t| |rtdd | jD S t| jS )Nr   GroupedSchedulerNodec                 s  r  rt   )contains_collectiverA   r=   r=   rD   r     r  z&contains_collective.<locals>.<genexpr>)r6  r  rh   r  snodesr  r*  r  r  r=   r=   rD   r       

r  c                 C  r  )Nr   r  c                 s  r  rt   )contains_waitrA   r=   r=   rD   r     r  z contains_wait.<locals>.<genexpr>)r6  r  rh   r  r  r  r*  r  r=   r=   rD   r    r  r  Optional[Operation]?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr|g}t| |jo| j|v S r  )ry  rw  rh   r?   r  r  r  r  r  r=   r=   rD   is_fallback_op  s   r  buf_namename_to_bufname_to_fused_nodec                 C  s   |||  j   S rt   )Zdefining_oprU  )r  r  r  r=   r=   rD   buf_name_to_fused_snode  s   r  c                 C  r
  r  r=   r  r=   r=   rD   r  *  r  collected_node_setMutableSet[BaseSchedulerNode]dict[str, SchedulerBuffer]dict[str, BaseSchedulerNode]criteria_cbCallable[[Any], bool]c                 C  sP   || rd S | |  | jD ]}t|j||}||v rqt|||||d qd S )Nr  )r  Zunmet_dependenciesr  r   find_recursive_deps_of_node)r  r  r  r  r  depZdefining_op_for_depr=   r=   rD   r  %  s"   

r  c                 C  r
  r  r=   r  r=   r=   rD   r  C  r  c              	   C  s   || rd S | |  |  D ]4}|jD ].}|jd usJ |j dkr%q|j |vr-q||j  }||v r9qt|||||d qqd S )NZOUTPUTr  )r  Zget_outputsr  r*  rU  find_recursive_users_of_node)r  r  r  r  r  or  Zuser_opr=   r=   rD   r  >  s,   

r  dynamo_gm_num_inputsaot_fw_gm_num_inputsc                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )r?   Z
_functorchrY   Zfunctionalize_rng_ops)r  r  Znum_rng_seed_offset_inputsr=   r=   rD   num_fw_fixed_arguments[  s   r  fx_gc                 C  sd   ddd}d}g }| j jD ]}|jdkr!||r|| |d	7 }q|ttt|ks.J t|S )z>
    Infers which inputs are static for a backwards graph
    rC   r'   r;   rg   c                 S  s(   d| j vod| j vod| j vod| j vS )NZtangentsZbwd_seedZbwd_base_offsetZbwd_rng_stater  r  r=   r=   rD   is_saved_tensork  s   
z'count_tangents.<locals>.is_saved_tensorr   r#  r(   N)rC   r'   r;   rg   )r   r  r  r"  r   r   rG   )r  r  	arg_countZstatic_arg_idxsr  r=   r=   rD   count_tangentsf  s   


r  c                   @  s.   e Zd ZU ded< dddZedd	d
ZdS )	BoxedBoolrg   rr   r;   c                 C  s   | j S rt   )rr   rl  r=   r=   rD   rO    s   zBoxedBool.__bool__rC  r
   Union[BoxedBool, bool]c                 C  s   t | tr
d| _| S dS r  )rh   r  rr   r  r=   r=   rD   disable  s   
zBoxedBool.disableNrz  )rC  r
   r;   r  )rx   ry   rz   r5  rO  rf  r  r=   r=   r=   rD   r    s
   
 
r  kernel_listc                 #  sh    ddl m} |j	 		 dd fdd}tj|d| d V  W d    d S 1 s-w   Y  d S )Nr(   r*   Trd  r+   kernel_namer<   r   r  r  gpurg   cpp_definitionr;   r
   c                   s     | | |||||S rt   r  )rd  r  r   r  r  r  r  Zorig_define_kernelr=   rD   define_kernel  s   
z.collect_defined_kernels.<locals>.define_kernelr  )NTN)rd  r+   r  r<   r   r<   r  r  r  rg   r  r  r;   r
   )codegen.wrapperr+   r  r   r  r  )r  r+   r  r=   r  rD   collect_defined_kernels  s   "r  c                 C  s   | d S )NZ__original__r=   r  r=   r=   rD    get_cloned_parameter_buffer_name     r  c                 C  s   | t v S rt   )rF   r  r=   r=   rD   r    r  r  c                 C  s   t | S rt   )r  r  r=   r=   rD   device_need_guard  r  r  c                 C  sF   t  r| tjkrtj rtj dkrdS | ttjtj	tjgv S )N)r  r   F)
rY   r  r?   r   r8   r@   Zget_device_capabilityr   r   rg   r0  r=   r=   rD   ,needs_fallback_due_to_atomic_add_limitations  s   
r  r  
self_dtype	src_dtypesrc_device_typesrc_is_tensorc                 C  s   | j tjjjtjjjfv r|d u rdS | j tjjjkrdnd}|d |fvp]|o.t|o.t|p]| j tjjjkoM|dkoM|oM|dkoMt	j
joMt	j
jpMt dkp]||koY|tjtjfv p]t S )NFr  r   r   r(   )Zoverloadpacketr?   r   ZatenZscatter_reduce_Zscatter_reduceZscatter_r  r  rY   r  Zfallback_scatter_reduce_sumZdynamic_threadsr  rg   r   r  )r  rb  r  r  r  r  Z	reduce_tyr=   r=   rD   use_scatter_fallback  s8   	r  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]m\}}td|dd ||u r2td	 q||u r;td
 qt||r|	 }t|rIdnd d |rb|j
dusXJ td|j
jj  td |jjD ]}t| qjtd |jjD ]}t| qyqtdt| dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )Ztorch._inductor.codegen.simdr  r  r6  r  r@  rG   r   rh   Zis_reductionr*  r  Zreduction_hintZread_writesZreadsZwritesr   r   )rs  r  r  r  r  r*  Zis_redr  r=   r=   rD   dump_node_schedule  s0   




r  r   r!  c                 C  s*   ddl m} ||  t| j t dkS )Nr   )statically_known_true)r  r  storage_offsetr1  r   GPU_ALIGN_BYTES)r   r  r=   r=   rD   tensor_is_aligned	  s   r  example_inputc                 C  s   t | jjsdS tjpt| S r  )r  r   r   rY   Zassume_aligned_inputsr  )r  r=   r=   rD   should_assume_input_aligned	  s   r  r]  c                  C  s4   t jj } | st S | jj}|st S | S rt   )	r?   _guardsTracingContexttry_getrb  nullcontextrw  r   Zsuppress_guards)tracing_contextr   r=   r=   rD   #maybe_get_suppress_shape_guards_ctx	  s   r  tuple[Any, str]c                 O  s   t jjtddJ tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    |	|
fS 1 sVw   Y  |	|
fS )Nr   Tr   )output_code_log)r7  r   r  r  rY   r?   r  r  rz  loggingr	   StreamHandlerZtorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGrC  removeHandler)r   rn   r  rz  r  Zlog_capture_stringchr  Z
prev_levelr7  r  r=   r=   rD   run_and_get_cpp_code.	  s$   




r  Sequence[InputType]Optional[ShapeEnv]c                 C  s<   t | }|d ur|jS | D ]}t|tjr|jj  S qd S rt   )rK   r   rh   r?   r#   r*  )rF  rw  inputr=   r=   rD   shape_env_from_inputsG	  s   r   Callable[[list[InputType]], Any]inputs_to_checkc                   s$   t  dkrS d fdd}|S )	Nr   
new_inputslist[InputType]r;   r
   c                   s   t |   | S rt   )copy_misaligned_inputs)r  r  r/  r=   rD   r  b	  s   
z)align_inputs_from_check_idxs.<locals>.run)r  r  r;   r
   )rG   )r/  r  r  r=   r  rD   align_inputs_from_check_idxs[	  s   r  c                 C  s`   d|   v r	d}ntdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s     | ]\}}|d  | V  qdS rA  r=   )rB   shaper  r=   r=   rD   r   o	  s    z)clone_preserve_strides.<locals>.<genexpr>r(   rq   )rN  r   r   r  r?   Z
as_stridedclone)rC   Zneeded_sizebufferr=   r=   rD   clone_preserve_stridesi	  s   "r  r  r  check_inputs_idxsc                 C  s>   |D ]}| | }t |tjsJ | t rt|| |< qd S rt   )rh   r?   r'  data_ptr	ALIGNMENTr  )r  r  r   Z_inpr=   r=   rD   r  u	  s   r  static_input_idxsc                 C  sT   g }|D ]}| | }t |tjr| t dkr|| qt|t|kr(|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rh   r?   r'  r  r  r"  rG   )rF  r  Zaligned_static_input_idxsr  r  r=   r=   rD   remove_unaligned_input_idxs	  s   
r  r   c                 C  sZ   ddl m} ttjj}|jjj}|jjj	j
}|jj| |kr#dS || o,|| |kS )Nr(   r   T)r   r   r?   Ziinfor   r   r   r   r  r   has_hintZis_expr_static_and_true)r   r   Zint_maxr  r  r=   r=   rD   expr_fits_within_32bit	  s   
r  compiled_graphr5   c                   s   t jj }|d urX|jd urZt|jdksJ t| |jd us#J |jD ]5}|d u r3|jd  q&d t jj  }r@|j d fdd|jt	fd	d
|D  q&d S d S d S )Nr   Fr   r
   r;   ,Union[float, int, SymInt, SymFloat, SymBool]c                   s(   d u rt | S  r| S | S rt   )rb   Zdeserialize_symexprZevaluate_symexpr)r   )fakify_first_callr   r=   rD   map_expr	  s
   

z4set_tracing_context_output_strides.<locals>.map_exprc                 3  r  rt   r=   )rB   r   )r   r=   rD   r   	  r  z5set_tracing_context_output_strides.<locals>.<genexpr>)r   r
   r;   r  )
r?   r  r  r  Zoutput_stridesrG   r  r"  r  tuple)r1  r  r3  r-  rp  r=   )r  r   r   rD   "set_tracing_context_output_strides	  s"   
r  c                  C  s`   t jd urt jS t  sdS tj rdS zddlm}  W n
 ty'   Y dS w | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
rY   Zfx_graph_remote_cacher  r?   Z_utils_internalZis_fb_unit_testZtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorZjustknobs_getval_intr  r=   r=   rD    should_use_remote_fx_graph_cache	  s   

r  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )r  subr  r=   r=   rD   normalize_name	  rM  r  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuc                 C  s   i | ]\}}||qS r=   r=   r  r=   r=   rD   r   	  r   r   z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r<   _triton_type_mappingr  )r   Ztriton_type_namer=   r=   rD   triton_type	  s   r  c                 C  s6   t | | }|dd}tt|}t|tjsJ |S )Nr	  ry  )_torch_triton_mappingr  r  r>   r?   rh   r   )r   Zadjusted_type	type_namer  r=   r=   rD   triton_type_to_torch	  s
   
r  r  rr   c                 C  sh   | j  o3|  | ko3|  | ko3| j|jko3| j|jko3|   |  ko3|  | kS rt   )	is_mkldnnrN  r  r   r   Zuntyped_storager  r  r  rr   r=   r=   rD   is_same_tensor	  s   

r  c                 C  sJ   | j o$|  | ko$| j|jko$| j|jko$tjj| tjj|kS rt   )r  rN  r   r   r?   r   Zmkldnnr  r  r=   r=   rD   is_same_mkldnn_tensor	  s   

r  tuple[str, ...]c                   C  r
  )N)rm  isnanZlogical_notlogical_andZsignbitand_leltgegteqnerz  xorr=   r=   r=   r=   rD   boolean_ops	
  r  r  c                   @  r6  )OpDtypeRuler$   type_promotion_kindr   override_return_dtypeNr4  r=   r=   r=   rD   r   
  r8  r   zdict[str, OpDtypeRule]op_dtype_propagation_rulesr!  r$   r"  c                 C  s   t ||t| < d S rt   )r   r#  )r   r!  r"  r=   r=   rD   #register_op_dtype_propagation_rules&
  s   r$  c                 C  s"   t jjr| tjtjfv rtjS | S )z"Maybe upcast [b]float16 to float32)rY   r	  Zcodegen_upcast_to_fp32r?   r   r   r   r0  r=   r=   rD   upcast_compute_type0
  s   r%  KeyTypeValTypec                   @  sl   e Zd ZdZd#ddZd$d
dZd%ddZd&ddZd'd(ddZd)ddZ	d*ddZ
d+dd Zd,d!d"ZdS )-
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    original_dictMapping[KeyType, ValType]c                 C  s   || _ i | _d S rt   r)  	new_items)rd  r)  r=   r=   rD   r>  E
  r?  zScopedDict.__init__r   r&  r;   r'  c                 C  s   || j v r
| j | S | j| S rt   r,  r)  rd  r   r=   r=   rD   r#  I
  s   


zScopedDict.__getitem__rr   r-  c                 C  s   || j |< d S rt   )r,  )rd  r   rr   r=   r=   rD   __setitem__N
  rM  zScopedDict.__setitem__r  rg   c                 C  s   || j v p	|| jv S rt   r-  r.  r=   r=   rD   __contains__Q
  r   zScopedDict.__contains__Nr  Optional[ValType]c                 C  s"   || j v r
| j | S | j||S rt   )r,  r)  r  )rd  r   r  r=   r=   rD   r  T
  s   

zScopedDict.getrb   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S rM  )rG   r)  r,  )rd  r  r  r=   r=   rD   r  Y
  s   


zScopedDict.__len__Iterator[KeyType]c                 c  s.    | j E d H  | jD ]
}|| j vr|V  q
d S rt   r+  )rd  r  r=   r=   rD   __iter__`
  s   

zScopedDict.__iter__c                 C  s   t | jp| jS rt   )rg   r)  r,  rl  r=   r=   rD   rO  f
  r  zScopedDict.__bool__c                 C  r  rt   r  r.  r=   r=   rD   __delitem__i
  r`  zScopedDict.__delitem__)r)  r*  )r   r&  r;   r'  )r   r&  rr   r'  r;   r-  )r   r  r;   rg   rt   )r   r&  r  r1  r;   r1  r  )r;   r2  rz  )r   r&  r;   r-  )rx   ry   rz   r{   r>  r#  r/  r0  r  r  r3  rO  r4  r=   r=   r=   rD   r(  =
  s    






r(  )Zfrozen_defaultfrozenrv   Optional[type[Any]]r6  c                 s"   d fdd}| d u r|S || S )Nrv   r\   r;   c                   s(   t jdkrtj| d dS tj|  dS )N)rP  r:  T)kw_onlyr6  r5  )r  version_infodataclasses	dataclass)rv   r5  r=   rD   wrapo
  s   
zir_dataclass.<locals>.wrap)rv   r\   r;   r\   r=   )rv   r6  r<  r=   r5  rD   ir_dataclassm
  s   r=  Optional[list[int]]c                  C  s&   t jj } | d ur| jr| jjS d S rt   )r?   r  r  r  Zfw_metadataZbw_donated_idxs)r  r=   r=   rD   get_donated_idxs|
  s   r?  r  c                 C  sZ   ddl m}m} ddlm} | D ]}|||fvr*|jd ur*dd |jjD |jj|< qd S )Nr(   r  r   c                 S  s   g | ]}|j qS r=   r  r  r=   r=   rD   rE   
  s    z;set_kernel_post_grad_provenance_tracing.<locals>.<listcomp>)	Zcodegen.simd_kernel_featuresr  r  r   r   r*  rx  r   Z._inductor_triton_kernel_to_post_grad_node_info)rs  r  r  r  r   r*  r=   r=   rD   'set_kernel_post_grad_provenance_tracing
  s   
r@  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r(   r  rP  r:  N)rx   ry   rz   V0_NO_TRITONV1_COMPILERV2_BACKENDSZV3_BACKENDS_TUPLEV4_DICTr=   r=   r=   rD   rA  
  s    rA  c                  C  sT   t jdd u rtjS dd l} dd l} t| jj	drtj
S t| j	j	dr'tjS tjS )Nr	  r   ZAttrsDescriptor)	importlibutil	find_specrA  rB  Ztriton.backends.compilerZtriton.compiler.compilerr   rZ  compilerrD  rC  rE  )r	  r=   r=   rD   #get_triton_attrs_descriptor_version
  s   rJ  c                   C  s   t  tjkS rt   )rJ  rA  rE  r=   r=   r=   rD   triton_version_uses_attrs_dict
  r  rK  rx  )ra   rb   r;   rb   )re   rf   r;   rg   )r   r   )r   r   r   rb   r   rb   r;   r   rz  )r   r   r;   r   )r   r   r;   rf   )r   r   r   r   r;   rf   )r   r   r;   r   )r   r   r   r   r;   r   )r   r   r;   r<   )r   r   r;   r   )r   r   r;   r   )r  r  r;   rg   )r  r'   r  r  r;   rg   )r  r
   rn   r  r  r  r;   r  )r8   )r   r<   r;   r-  )r(   r8   )
r/  r0  r1  r2  r3  rb   r   r<   r;   r   )r=   r:  r:  r;  r8   )r/  r0  r1  r2  r3  rb   r<  rb   r=  r   r   r<   r;   r   )rC  r
   rD  r<   r;   r-  )rC  r
   rH  rI  r;   r-  )r   rb   r   rb   r;   rb   )rC   rM  rN  rb   r;   rO  )rC   rQ  r;   rR  )r   rg  r;   rh  )rs  rt  r;   ru  )rs  r|  r}  r~  r;   r<   )rs  rt  rq  r+   r;   r  rt   )r  r  r  r  r;   r  )rn   r  r  r  r;   r  )r  rf   r;   r<   )r   rf   r;   r  )r  r<   r;   rg   )r  rV   r  rb   r;   r  )r  rg   r;   rg   )r   r<   r;   r  )r  rf   r  r  r;   rf   )r   r
   r;   r  )rn   r
   r;   rg   )r+  r  r;   r  )r+  r  r;   r'   )rC  r
   r;   r
   ry  )NNT)r  r  r  r  r  rg   r;   r  )r   r2  r;   r!  )r   r&   r   r&  r;   r!  )r   r/  r;   rb   ra  r   )r  r  r;   rg   r  )r  rb   r   r   r;   r)   )r  r1   r  r  r;   rg   )r  r<   r;   rg   )r  r1   r  rg   r  rg   r;   rg   )r  r0   r;   rg   )
r  r1   r  rb   r  rb   r  rb   r;   rg   )r   r<   r;   r<   )r;   r  )r  r1   r;   rg   )r  r1   r  r  r  r0   r;   rg   )FTFN)r  r1   r  r0   r  r0   r  rg   r  rg   r  rg   r  r  r;   rg   )r   r  rn   ra  r  rb  r;   r	  )r   r0  rn   r
   r  r
   r;   r  )r   r0  r;   r  )r   r0  rn   r
   r  r
   r;   rI  )r   r0  rn   r
   r  r
   r;   r<   )r   r0  rn   r
   r  r
   r;   r&  )r*  r0  r+  r0  r;   r  )r0  r0  r1  r2  r;   r
   )r9  r<   r;   r-  )r;   r  )r  r2  r;   rg   )rF  rG  r;   rg   )r  rf   r;   r/  )rJ  rg   rn   r
   r  r
   r;   rK  )rb  r<   r;   rg   )rb  r<   r;   rb   )rj  rk  r;   rg   )
r  r0  r+  r%   rr  r2  r9  r<   r;   r-  )r  r  r;   rg   )r*  r  r  r  r;   rg   )r*  r  r;   rg   )r  r6   r;   rg   )r*  r  r  r  r;   rg   )r  r<   r  r  r  r  r;   r
   )r  r6   r  r  r  r  r  r  r  r  r;   r-  )r  rb   r  rb   r;   rb   )r  r  r;   rb   )r  rI  r;   r  )r   r<   r;   r<   )r   r  r;   rg   )r   r<   r;   rg   )r   r/  r;   rg   )r  r  rb  r  r  r/  r  r/  r  r<   r  rg   r;   rg   )rs  r|  r;   r-  )r   r!  r;   rg   )r  r!  r;   rg   )r;   r]  )r   r0  rn   r
   r  r
   r;   r  )rF  r  r;   r  )r/  r  r  rO  r;   r  )rC   r!  r;   r!  )r  r  r  rO  r;   r-  )rF  r  r  rO  r;   rO  )r   rf   r;   rg   )r1  r2  r  r5   r;   r-  )r   r/  r;   r<   )r   r<   r;   r/  )r  r!  rr   r!  r;   rg   )r;   r  )r   r<   r!  r$   r"  r   r;   r-  )r   r/  r;   r/  )rv   r7  r6  rg   r;   r
   )r;   r>  )rs  r|  r  r<   r;   r-  )r;   rA  (6  
__future__r   r  rb  r:  enumr   rF  rW  rz  r  r  rl  r   r
  rg  r  r  r  r  rn  r4  r7  collections.abcr   r   r   r   r   r   r	   typingr
   r   r   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   r   r   r   ri   r?   Ztorch._inductor.runtime.hintsr   Ztorch.utils._ordered_setr   Ztorch.utils._pytreer   r   r   r    r!   r"   r#   Ztorch._prims_commonr$   Ztorch.fxr%   r  r&   Ztorch.fx.noder'   r  r)   r  r+   r   r-   rw  r.   r/   r0   r1   r2   r3   Zoutput_coder5   rT  r6   r7   rF   r:   	lru_cacherI   Ztorch._dynamo.device_interfacerJ   Ztorch._dynamo.utilsrK   Ztorch.autogradrL   Ztorch.autograd.profiler_utilrM   Z(torch.fx.passes.graph_transform_observerrN   Ztorch.fx.passes.shape_proprO   Ztorch.utils._sympy.functionsrP   rQ   rR   rS   rT   Ztorch.utils._sympy.symbolrU   rV   Ztorch.utils._sympy.value_rangesrW   rX   ry  rY   Zruntime.runtime_utilsrZ   r   Z_IS_WINDOWS	getLoggerrx   r   r\   r  r   Z	VarRangesr'  rb   Z	InputTypeZGPU_KERNEL_BIN_EXTSr  r  r  r  rc   rd   rm   Functionro   r   r   r   r   r   r   r   r   r   r	  r  r,  r   r9  rB  rG  rJ  rK  rP  rY  rZ  r[  r]  rr  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r5  r  r  r  r  rc  r  r%  r.  r1  r2  r;  r7  r9  r{  r  r@  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r"  r$  r%  r)  r/  r8  r;  r?  rD  rE  rH  rI  rL  r  rR  r\  r_  ra  rd  re  ri  r  r  rn  Enumro  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  compiler
  r  r  r  r  r  r   r#  r$  r%  r&  r'  r(  r=  r?  r@  rA  rJ  rK  r=   r=   r=   rD   <module>   s<   4 


$T&		$=/7$ 
 
.
?
	,		!
	
$$		'	




	$
0 
