a
    h6                 
   @  s  U d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlmZmZmZmZmZ d dlmZ d dl	mZ d dl m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d dlm3Z3 d dl4Z4d dl5Z5d d	l6m7Z7 d d
l8m9Z9 d dl:m;Z; dgZ<e)rNd dlm=Z=m>Z>m?Z? d dl5m@Z@mAZAmBZB d dlCmDZD d dlEmFZF d dlGmHZH d dlImJZJ ddlKmLZL ddlMmNZN ddlOmPZP ddlQmRZRmSZSmTZTmUZUmVZVmWZWmXZX ddlYmZZZ ddl[m\Z\m]Z] g dZ^e*dZ_ej`ddddZad d lbmcZc d d!ldmeZe d d"lfmgZg d d#lhmiZi d d$ljmkZk d d%llmmZm d d&lnmoZompZpmqZqmrZrmsZs d d'ltmuZumvZv d d(lwmxZxmyZy dd)lzm{Z{ dd*l|m}Z~ ejd+kZeeZe*d,Zee4je4jf Ze'e+e5jee5jBf  Zd-d.d/Zd0Zd0Zd0Zd1Zd2Zeed @ d kred3ksJ d4d5d5d6d7d8Zd9d:d;d<d=ZG d>d? d?e4jZejd@dAG dBdC dCZdudFd5d5dGdHdIdJZdvdFd5d5dGdHdKdLZej`d:ddMdNZdOdPdQdRdSZdTd9dUdVdWZdXdXd9dYdZd[Zd\d]dUd^d_Zd`d`d`dadbdcZ}ddddedfdgZdhdidjdkdlZd`dmdndodpZdqdrdjdsdtZdud:dvdwdxZdydz fd{d|d:d}d~dZdddddddZdwdddQddZdxddd5ddGdddZdyddd5d5dGddGdddZddddddZddddddZd5d5d5dddZdd5ddddZdddddZe/dZe*dd@dZG dd de(e$eef ZdddddZdddddZddddddZddddddZdzddddƜddȄZdddd̜dd΄Zd9ddϜddфZd9ddӜddՄZdd:d֜dd؄Zdd5ddۜdd݄Zd:d:dޜddZdddddZd9dd9dddZdddddZdd:dddZdddddZdd{dddZdddddZd dlZddddZg Zded< dddddZdddd Zejǐd{ddd:ddddZeZeZeZdddd	d
Z̐ddddddZed3dd5dddZG dd de&ZejG dd dZG dd dZG dd de҃ZejǐddddZG dd dZG d d! d!eՃZej`d|d"d:d#d$d%Zejd5dd&d'Zd5dd(d)Zِd}d5dPd*d+d,d-d.Zڐd/d0d:d1d2d3Zdd:d4d5d6Zdd:d4d7d8Zݐd9d9d:d/d:d:d:d;d<d=Zސd>d:d?d@dAZߐd/d5d5d5d:dBdCdDZdd:dEdFdGZdHZdIZg dJZe+ee4jf ZdKeĐdL< dLdLdLd:dMdNdOZej`dLdLdLddMdPdQZej`dddQdRdSZej`dTddUdVZd/d:dWdXdYZd/d5d5d5d:dBdZd[Zd/d5d5d5d:dBd\d]Zd/d:dWd^d_Zd/d:dWd`daZd/dbd>d:dcdddeZd~d/d>d>d:d:d:d*d:dfdgdhZd:ddidjZG dkdl dlZdmdndodpdqdrdsZdmdndodpdqdtduZddvddwdxZdmdndoddqdydzZdmdndoddqd{d|Zdmdndoddqd}d~ZdmdndoddqddZejddddddZdddddddZdddddZddddZdd:dddZdd:dddZdd:dddZ d9ddddZejd:ddddddZd5dddZej`d5dddZej`dd5dddZej`d5dddZd5dddZdd:dddZdd5dddZ	d:dddZ
d:dddZdd:dddZdd:dddZG dd dejZddddddddZdd:dddZdd:dddZddÐdd:dŜdƐdǄZdd:dɜdʐd˄Zdd:d͜dΐdτZdd:d͜dАdфZdҐdd:dŜdԐdՄZddddd֜dאd؄Zddz fd̐dڐdېdܐdddޜdߐdZddz fd̐dڐdېdܐdddޜddZd5d5d5dddZdd5dddZejG dd dZejdddddZdddddZdd:dQddZ dd:dQddZ!dd:dddZ"duddddd:d:dddZ#dddddZ$dd:dddZ%dd:d ddZ&ddddZ'dmdndoddqddZ(d	d
dddZ)dddddddZ*dddddZ+ddddddddZ,d	dddddZ-d9d:dddZ.dd dd!d"d#Z/d:dd$d%Z0dddd&d'Z1d(d)d*d+d,d-d-d.Z2d/d0 e23 D Z4e5d1Z6dddd2d3Z7dddd4d5Z8ddd:d6d7d8Z9ddd:d6d9d:Z:ej`d;dd<d=Z;ejG d>d? d?Z<i Z=d@eĐdA< ddBddddCdDdEZ>e9 Z?dFeĐdG< ddddHdIZ@dddJdKZAddddLdMZBe*dNZCe*dOZDG dPdQ dQeeCeDf ZEe.d@dRdd@dAdSd:ddTdUdVZFdWddXdYZGddZdd:dd[d\d]ZHG d^d_ d_ejZIej`d_dd`daZJd:ddbdcZKddd:dɜdedfZLdddgdhZMdd:didjdkZNd5ddldmdnZOdoZPdd:dpdqdrZQdd:dpdsdtZRdS (      )annotationsN)
CollectionIteratorMappingMutableMapping
MutableSet)datetime)StringIO)AnyCallablecastGenericLiteral
NamedTupleOptionalProtocolTYPE_CHECKINGTypeVarUnion)Concatenatedataclass_transform	ParamSpecSelf	TypeAlias	TypeGuard)mock)DeviceProperties)
OrderedSet)tree_map_onlyZ!activation_quantization_aten_pass)IterableSequence
ValuesView)SymBoolSymFloatSymInt)ELEMENTWISE_TYPE_PROMOTION_KIND)GraphModule)ShapeEnv)Node   )WorkspaceArgPythonWrapperCodegenGraphLowering)BufferExternKernelExternKernelOutIRNodeLayout	OperationReinterpretViewCompiledFxGraph)BaseSchedulerNodeSchedulerBuffer)cudampsxpuTstrreturnc                  C  s:   dd t D } t| dksJ t| dkr.dn|  }|S )Nc                 S  s   g | ]}t t| r|qS  )getattrtorchis_available.0xrA   rA   C/var/www/auris/lib/python3.9/site-packages/torch/_inductor/utils.py
<listcomp>`       z get_gpu_type.<locals>.<listcomp>r)   r   r:   )	GPU_TYPESlenpop)Z
avail_gpusZgpu_typerA   rA   rH   get_gpu_type^   s    rN   )get_interface_for_device)detect_fake_mode)
DeviceType)	EventList)GraphTransformObserver)	ShapeProp)CeilDivCleanDivFloorDivIdentityModularIndexing)make_symbolSymT)bound_sympyValueRanges)config)ceildivwin32_Tz.cubinz.spv)r:   r<         @      zmust be power of 2int)nbytesr@   c                 C  s   | t  d t  @ S )z/Round up to the nearest multiple of ALIGN_BYTESr)   )ALIGN_BYTES)rg   rA   rA   rH   _align   s    ri   
sympy.Exprbool)vr@   c                 C  s<   t | tjtjfr"ttt| jS t | tp:t	| t
t
kS )z:v can be statically proven to be a multiple of ALIGN_BYTES)
isinstancesympyAddZMaxallmap_is_alignedargsaligngcdrh   )rl   rA   rA   rH   rr      s    rr   c                   @  s,   e Zd ZdZdZdZedddddZd	S )
rt   z<Symbolically round up to the nearest multiple of ALIGN_BYTESr)   Trj   zOptional[sympy.Expr])valuer@   c                 C  s,   t |ttjfrtt|S t|r(|S d S N)rm   rf   rn   Integerri   rr   )clsrw   rA   rA   rH   eval   s    z
align.evalN)__name__
__module____qualname____doc__nargs
is_integerclassmethodr{   rA   rA   rA   rH   rt      s
   rt   Tfrozenc                   @  s2   e Zd ZU dZded< ded< ded< ded< d	S )
GraphPartitionMapzP
    Mapping from the partition info (e.g., input/output) to the graph info
    rf   idzlist[Optional[int]]Zinput_index_mappingZoutput_index_mapping	list[str]Zconstant_namesN)r|   r}   r~   r   __annotations__rA   rA   rA   rH   r      s
   
r      d   zCallable[[], Any]float)fnwarmuprepr@   c              
   C  s&  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]}|	  |   qR|  t j  |
|d }tdt|| }tdt|| }	t|D ]
}|   qdd	 t|	D }d
d	 t|	D }t jjt jjjgd}
t j  t|	D ]\}|	  ||   t jjd |   W d   n1 sP0    Y  ||   qt j  t dd	 t||D }W d   n1 s0    Y  t | }td t|
 jddd tdd	 |
 D }|r|tdd |D d 8 }td| |S )R  
    Returns benchmark results by examining torch profiler events.
    This could be more accurate as it doesn't count CPU side overhead.
    However, this also requires manually excluding irrelevant event, e.g.
    vectorized_elementwise_kernel which is used to fill L2 cache,
    various CUDA events, etc, so could also be fragile.
        Ar:   dtypedeviceTZenable_timing   r)   c                 S  s   g | ]}t jjd dqS Tr   rC   r:   EventrF   _rA   rA   rH   rI      rJ   zfp8_bench.<locals>.<listcomp>c                 S  s   g | ]}t jjd dqS r   r   r   rA   rA   rH   rI      rJ   Z
activitiesZRunCudaModuleNc                 S  s   g | ]\}}| |qS rA   )elapsed_time)rF   serA   rA   rH   rI      rJ   
raw eventsself_device_time_totalZsort_by	row_limitc                 S  s&   g | ]}|j tjkrd |jv r|qS )Zfused_abs_max_0device_typerQ   CUDAnamerF   eventrA   rA   rH   rI      s   c                 s  s   | ]}|j V  qd S rx   Zdevice_time_totalr   rA   rA   rH   	<genexpr>   rJ   zfp8_bench.<locals>.<genexpr>     @@profiling results: %s ms)rC   r:   synchronizeemptyrf   float16r   recordrangezero_r   maxprofilerprofileProfilerActivityr   Znvtxtensorzipmeanitemlogdebugkey_averagestablerR   events
statistics)r   r   r   cachestart_event	end_eventr   estimate_msn_warmupn_repeatpitimesresfiltered_eventsrA   rA   rH   	fp8_bench   sd    	


&
$
r   c                   s  |   t j  t jtdt jdd}t jjdd}t jjdd}|  tdD ]}|  |   qR|  t j  |	|d }t
dt|| }t
dt|| }	t|D ]
}|   qt j  t jjt jjjgd6}
t|	D ]}|  |   qt j  W d	   n1 s0    Y  td
 t|
 jddd tdd |
 D }t||	 dkrtdt||	t||	  t fddt|D }|  | }td t|jdd tdd |D d |	 }td| |S )r   r   r:   r   Tr   r   r)   r   Nr   r   r   r   c                 S  s&   g | ]}|j tjkr|jd kr|qS )zContext Syncr   r   rA   rA   rH   rI   7  s   z,do_bench_using_profiling.<locals>.<listcomp>r   zYFailed to divide all profiling events into #repeat groups. #CUDA events: %d, #repeats: %sc                   s    g | ]\}}|  d kr|qS )r   rA   )rF   r   r   Znum_event_per_grouprA   rH   rI   F  s   zprofiling time breakdown)r   c                 s  s   | ]}|j V  qd S rx   r   r   rA   rA   rH   r   R  rJ   z+do_bench_using_profiling.<locals>.<genexpr>r   r   )rC   r:   r   r   rf   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rR   r   rL   RuntimeError	enumerateZ_build_treesum)r   r   r   r   r   r   r   r   r   r   r   r   r   Zactual_eventsr   rA   r   rH   do_bench_using_profiling  sh    


*


r   c               
   C  s   z8ddl m}  tjdd | d uo6tttjdd dW S  tyL   Y dS  t	y } zdt
|v sjJ W Y d }~dS d }~0 0 d S )	Nr   )	roi_alignztorchvision::nmsZMetaZtorchvisionr   Fztorchvision::nms does not exist)Ztorchvision.opsr   rC   _CZ%_dispatch_has_kernel_for_dispatch_keyhasattrrB   opsImportErrorr   r>   )r   r   rA   rA   rH   has_torchvision_roi_alignW  s    
r   z"Union[Optional[torch.device], str]ztorch.device)r   r@   c                 C  s`   | d u rt djS t| tr(t | } | jdvr\| jd u r\t| j}t j| j|j	 dS | S )Ng        )cpumeta)index)
rC   r   r   rm   r>   typer   rO   ZWorkerZcurrent_devicer   Zdevice_interfacerA   rA   rH   decode_deviceg  s    


r   zIterable[sympy.Expr])itr@   c                 C  s   t tj| tjjS rx   )	functoolsreduceoperatormulrn   SZOner   rA   rA   rH   sympy_productr  s    r   zSequence[sympy.Expr])seq1seq2r@   c                 C  s2   t | t |ksJ ttdd t| |D S )Nc                 s  s   | ]\}}|| V  qd S rx   rA   )rF   abrA   rA   rH   r   x  rJ   zsympy_dot.<locals>.<genexpr>)rL   rn   expandr   r   )r   r   rA   rA   rH   	sympy_dotv  s    r   zIterable[_T]zValuesView[_T]c                 C  s   dd | D   S )Nc                 S  s   i | ]}t ||qS rA   )r   rE   rA   rA   rH   
<dictcomp>|  rJ   zunique.<locals>.<dictcomp>)valuesr   rA   rA   rH   unique{  s    r   zUnion[int, sympy.Expr])numberdenomr@   c              	   C  sr   t | tjst |tjr.tt| t|S t | trBt |tshJ |  dt|  d| dt| t| |S )Nz: , )rm   rn   ExprrU   sympifyrf   r   runtime_ceildiv)r   r   rA   rA   rH   r_     s     r_   Optional[torch.dtype]keyr@   c                 C  s   | d u rdS t | dd }dddddddd	d	d
dddddddd	dddd}|dd t| D  t| t rx| S d||  S )Nz*i8.r   i1Zfp8e4nvZfp8e5Zfp8e4b15Z
fp8e4b15x4u8Zfp16Zbf16Zfp32Zfp64i8Zi16Zi32Zi64u16u32Zu64)rk   Z
float8e4nvZfloat8e5Zfloat8e4b15Zfloat8e4b15x4float8_e4m3fnfloat8_e5m2Zfloat8_e8m0fnuZfloat4_e2m1fn_x2r   bfloat16float32float64int8int16int32int64uint8Zuint16Zuint32uint64c                 S  s   i | ]
}||qS rA   rA   )rF   rl   rA   rA   rH   r     rJ   z_type_of.<locals>.<dictcomp>*)r>   splitupdatelistr   rm   )r   Z	dtype_strZtysrA   rA   rH   _type_of  s6    r  z"Iterable[Union[int, torch.SymInt]]zlist[sympy.Expr])lstr@   c                 C  s   dd | D S )z
    Gets the shape and stride of a tensor. For non-symbolic tensors, this is
    trivial. But for symbolic tensors, we need to map from SymIntNode into
    sympy.Expr.
    c                 S  s   g | ]}t |qS rA   )rn   r   rF   r   rA   rA   rH   rI     rJ   z-convert_shape_to_inductor.<locals>.<listcomp>rA   r  rA   rA   rH   convert_shape_to_inductor  s    r  zUnion[int, torch.SymInt])r   r@   c                 C  sB   ddl m} t| tr| S t| tjr.t| S |jjjj	| ddS )zL
    Like convert_shape_to_symint, but operates on a single expression.
    r)   VN)hint)
virtualizedr  rm   rf   rn   ry   graphsizevars	shape_envZcreate_symintnode)r   r  rA   rA   rH   convert_to_symint  s    

r  z Iterable[Union[int, sympy.Expr]]zlist[Union[int, torch.SymInt]]c                 C  s   dd | D S )zz
    Takes a list of shapes from Inductor and converts them into symints (or just
    ints if all shapes are static).
    c                 S  s   g | ]}t |qS rA   )r  r  rA   rA   rH   rI     rJ   z+convert_shape_to_symint.<locals>.<listcomp>rA   r  rA   rA   rH   convert_shape_to_symint  s    r  ztorch._ops.OpOverload)opr@   c                 C  s   t dd | jjD S )z-
    Does this op overload have aliasing
    c                 s  s   | ]}|j d uV  qd S rx   )Z
alias_inforF   r   rA   rA   rH   r     rJ   zis_view.<locals>.<genexpr>)any_schema	argumentsr  rA   rA   rH   is_view  s    r"  c                 C  s   dS NFrA   )r   rA   rA   rH   <lambda>  rJ   r$  r(   z'Callable[[torch._ops.OpOverload], bool])useis_pointwise_fnr@   c                   s~   | j dksdS t| jtjjs.| jtju s.dS ttjj| j}|tju sPt	|rht
 fdd| jD S tjj|jv p| |S )z
    Do all uses of this op have torch.Tag.pointwise or return True for optional `is_pointwise_fn`

    Uses in views ops will follow the views uses
    call_functionFc                 3  s   | ]}t | V  qd S rx   )is_pointwise_use)rF   ur&  rA   rH   r     rJ   z#is_pointwise_use.<locals>.<genexpr>)r  rm   targetrC   _ops
OpOverloadr   getitemr   r"  rp   usersTagZ	pointwisetags)r%  r&  r+  rA   r*  rH   r(    s    


r(  r
   	list[Any]zdict[str, Any]z&tuple[GraphModule, list[torch.Tensor]])r+  rs   kwargsr@   c                   s   t j  g ddd fdd} j| gtt j|||fR  }t| jjdkrpt	| jjd j
dkrp|f} | t ji  }|fS )	Ntorch.Tensorr(   )argr@   c                   s    |   dt S )Nr5  )appendplaceholderrL   )r5  gZ
graph_argsrA   rH   add_tensor_arg  s    
z)gen_gm_and_inputs.<locals>.add_tensor_argr)   r   Tensor)rC   fxZGraphr'  r   r;  rL   r  returnsr>   r   outputr&   )r+  rs   r3  r:  nodegmrA   r8  rH   gen_gm_and_inputs  s     

rA  r:   Nonec                 C  s(   | dkrd S t | }| r$|  d S Nr   )rO   rD   r   r   rA   rA   rH   r     s
    r   zCallable[..., Any]zSequence[Any])modelexample_inputsr   r   r@   c                 C  sT   t | td t }t|D ]}| | }t | q"t }|d usLJ || S )Ni9  )r   rC   Zmanual_seedtimeperf_counterr   )rD  rE  r   r   t0r   resultt1rA   rA   rH   timed  s    

rK  rA   
         ?)rD  rE  r   repeatbaseliner   r@   c                   sH   t  fddt|D }t | }t|| d | S )Nc                   s   g | ]}t  qS rA   )rK  r   r   rE  rD  r   rA   rH   rI   3  rJ   z%print_performance.<locals>.<listcomp>z.6f)rC   r   r   Zmedianprintr   )rD  rE  r   rN  rO  r   ZtimingsZtookrA   rP  rH   print_performance*  s    rR  )objmethodr@   c                   s$   t | |  t| | fdd dS )zKReplace obj.method() with a new method that returns a precomputed constant.c                     s    S rx   rA   rA   rI  rA   rH   r$  =  rJ   z#precompute_method.<locals>.<lambda>N)rB   setattr)rS  rT  rA   rU  rH   precompute_method:  s    rW  r   )rS  methodsr@   c                 C  s   |D ]}t | | qdS )zFReplace methods with new methods that returns a precomputed constants.N)rW  )rS  rX  rT  rA   rA   rH   precompute_methods@  s    rY  r   r   r@   c                 C  s   t | |kt | |k  S rx   )rf   )r   r   rA   rA   rH   cmpF  s    r[  zUnion[int, Sequence[int]]zSequence[int])rG   sizer@   c                 C  s:   t | tr| g| S t| dkr6t| | d g| S | S )Nr)   r   )rm   rf   rL   r   )rG   r\  rA   rA   rH   pad_listlikeJ  s
    

r]  ztuple[_T, ...]zlist[_T]rG   r@   c                 C  s,   t | dkrg S ddddd}t| |dS )Nr   ra   r>   )elemr@   c                 S  s0   t | tr| S ddlm} t | |s(J |  S )Nr)   )r8   )rm   r>   	schedulerr8   get_name)r_  r8   rA   rA   rH   	sort_funcW  s
    
ztuple_sorted.<locals>.sort_funcr   )rL   sorted)rG   rb  rA   rA   rH   tuple_sortedS  s    	re  PRV)	covariantc                   @  s2   e Zd ZedddddZdddd	d
dZdS )CachedMethodr
   rB  )r   r@   c                 C  s   d S rx   rA   )r   rA   rA   rH   clear_cacheh  s    zCachedMethod.clear_cacheP.argsP.kwargsrg  rs   r3  r@   c                 O  s   d S rx   rA   selfrs   r3  rA   rA   rH   __call__k  rJ   zCachedMethod.__call__N)r|   r}   r~   staticmethodrj  rp  rA   rA   rA   rH   ri  g  s   ri  z!Callable[Concatenate[Any, P], RV]zCachedMethod[P, RV])r   r@   c                   sr   | j }d| d d| i}td| d  d  d | t| || d }d	d
d fdd}||_|S )N___cacher   z        def zC_cache_on_self(self):
            try:
                return self.zy
            except AttributeError:
                pass
            rv = fn(self)
            object.__setattr__(self, "z%", rv)
            return rv
        Z_cache_on_selfr
   rB  ro  r@   c                   s   t |  rt|   d S rx   )r   delattrro  rc  rA   rH   rj    s    
z"cache_on_self.<locals>.clear_cache)r|   execlstripr   wrapsrj  )r   r   ctxwrapperrj  rA   rc  rH   cache_on_selfo  s"    

r|  z0Union[Sequence[BaseSchedulerNode], ExternKernel]zOrderedSet[Node])node_scheduler@   c                 C  sN   ddl m} t| tr2ttjdd | D t S t| |j	rD| j
S t S d S )Nr)   irc                 S  s$   g | ]}t |d r|jr|jjqS )r?  )r   r?  originsrF   r?  rA   rA   rH   rI     s   z%aggregate_origins.<locals>.<listcomp>) r  rm   r  r   r   r   or_r   r0   r  )r}  r  rA   rA   rH   aggregate_origins  s    
	r  zSequence[BaseSchedulerNode]z8Literal[True, 'torch', 'original_aten', 'inductor_node'])r}  descriptive_namesr@   c                 C  s   t | }|dkr,dd |D }tt|}n|dkrg }|D ]T}|jdkr<d|jv r<|jd d }t|d tr||d  q<||d j q<tt|}n|d	krd
d |D }nt	|}d
dg| S )Noriginal_atenc                 S  s<   g | ]4}|j d krd|jv r|jd dur|jd jjqS )r'  r  N)r  r   _overloadpacketr|   rF   originrA   rA   rH   rI     s
   

z)get_fused_kernel_name.<locals>.<listcomp>rC   r'  Zsource_fn_stackr   r)   Zinductor_nodec                 S  s   g | ]}|j d kr|jqS r'  )r  r   r  rA   rA   rH   rI     s   r   Zfused)r  rd  r   r  r   rm   r>   r6  r|   NotImplementedErrorjoin)r}  r  all_originssourcesr  Z	source_fnrA   rA   rH   get_fused_kernel_name  s,    r  r,   ztuple[str, str])r}  r{  r@   c                   s  t | }dd |D }tt}tt}d  t|rtdd |D }t|dkr|d j t dsdd	 t j	D }| _
|j fd
dd |D ]f}d|jv r|jd d urt|jd j}	||	 |j d|jv r|jd d j}	||	 |j q d urdnd}
|j d|
 dd|  dd|  d}|j dg}t| D ]0\}}||j d| ddt|  q` d ur||j d |D ] }||j d|   q|d|fS )Nc                 S  s   g | ]}|j d kr|qS r  r!  r  rA   rA   rH   rI     rJ   z'get_kernel_metadata.<locals>.<listcomp>c                 s  s   | ]}|j V  qd S rx   )r  )rF   nrA   rA   rH   r     rJ   z&get_kernel_metadata.<locals>.<genexpr>r)   r   )_inductor_kernel_metadata_node_to_idx_mapc                 S  s   i | ]\}}||qS rA   rA   )rF   idxr  rA   rA   rH   r     rJ   z'get_kernel_metadata.<locals>.<dictcomp>c                   s
    j |  S rx   )r  r  Zsingle_graphrA   rH   r$    rJ   z%get_kernel_metadata.<locals>.<lambda>rc  r  Z	from_nodezTopologically SortedZUnsorted z Source Nodes: [r   z], Original ATen: []z" Source node to ATen node mapping:z   z => z Graph fragment:
)r  collectionsdefaultdictr  rL   r   r  r   r   nodesr  sortr   r>   r  r6  r   commentr  keysrd  itemsZformat_node)r}  r{  r  Zinductor_nodesZfrom_node_dictZoriginal_aten_dictZunique_graphsZnode_to_idx_mapr?  r   Zsort_strmetadataZdetailed_metadataZoriginal_noder  r  rA   r  rH   get_kernel_metadata  sJ    






r  zIterable[torch.fx.Node]zOptional[Callable[[Any], bool]]zOrderedSet[torch.fx.Node])initial_queueskip_filterr@   c                 C  sX   t | } t| }| rT|  }|jD ].}|r4||r4q"||vr"|| | | q"q|S )zJReturns the set of nodes whose values depend on those within initial_queue)r  r   rM   r/  addr6  )r  r  Zdominated_setr?  userrA   rA   rH   dominated_nodes  s    

r  zSequence[IRNode]zdict[str, IRNode]zOrderedSet[IRNode]rm  c                   sj   dd l }ddlm  ddd fddfd	d
| D }fdd
| D }t|jg ||R  S )Nr   r)   r~  r2   rk   )r  r@   c                   sD   t |  jr| jS t |  jr,| jS t |  joBt |  jS rx   )rm   	TensorBoxdata
StorageBoxr2   Z	Pointwiser  r  is_unrealized_noderA   rH   r    s
    

z*gather_origins.<locals>.is_unrealized_nodec                   s   g | ]} |r|j qS rA   r  )rF   valr  rA   rH   rI      rJ   z"gather_origins.<locals>.<listcomp>c                   s   g | ]} |r|j qS rA   r  rF   r5  r  rA   rH   rI   !  rJ   )	itertoolsr  r  r   r   chain)rs   r3  r  Zkwarg_originsZarg_originsrA   r  rH   gather_origins  s    r  exprr@   c                   sX   ddddd ddd fddddd fd	d
dddfdd| S )z
    Normal sympy str is very slow, this is a lot faster.  The result are
    somewhat worse, as it doesn't do as much simplification.  So don't
    use this for final codegen.
    rj   rk   r  c                 S  s(   t | tjo&t| jdko&| jd dkS )N   r   r   )rm   rn   MulrL   rs   r  rA   rA   rH   is_neg_lead,  s    &zsympy_str.<locals>.is_neg_leadr>   c                   sp   t | tjrdt| jdkrP | jd rP| jd  d| jd jd  S dt| jS n| S d S )Nr  r)   r   z - z + )rm   rn   ro   rL   rs   r  rq   r  )r  sympy_str_mulrA   rH   sympy_str_add1  s
    (z sympy_str.<locals>.sympy_str_addc                   sH   t | tjr< | r(d| jd  S dt| jS n| S d S )N-r)   z * )rm   rn   r  rs   r  rq   r  )r  sympy_str_atomrA   rH   r  <  s
    z sympy_str.<locals>.sympy_str_mulc                   st   t | tjr| jS t | tjtjfr4d |  dS t | tttt	frh| j
j ddtt| j dS t| S d S )N()r   )rm   rn   Symbolr   ro   r  rY   rV   rW   rX   funcr|   r  rq   	sympy_strrs   r>   r  )r  rA   rH   r  G  s    "z!sympy_str.<locals>.sympy_str_atomrA   r  rA   )r  r  r  r  rH   r  %  s
    
r  zValueRanges[Any]r   r@   c                 C  sB   ddl m} tjr6t|jdd  }r6|jdkr6t| S t	 S d S )Nr)   r  Zcurrent_nodeZ
index_expr)
r  r  r^   Zcompute_all_boundsrB   interpreterr+  r\   r]   unknown)r   r  Zfx_noderA   rA   rH   get_bounds_index_exprT  s    r  prefixr@   c                 C  s   | d dkS )Nr   rrA   )r  rA   rA   rH   prefix_is_reductionb  s    r  r[   sympy.Symbol)r  r  r@   c                 C  s   | t jksJ t| |dddS )9
    Used to generate an integer-nonnegative symbol.
    TintegerZnonnegative)r[   ZSIZErZ   )r  r  rA   rA   rH   sympy_index_symbol_with_prefixf  s    r  )checkr@   c                 C  s   | s
t jot jS rx   )r^   Zdebug_index_assertsZassert_indirect_indexing)r  rA   rA   rH   generate_assertr  s    r  r   r@   c                 C  s    | d dksJ t j| dddS )r  r   r   Tr  )rn   r  r   rA   rA   rH   sympy_index_symbolv  s    r  zdict[sympy.Expr, Any])r  replacementsr@   c                   s4   dddddd t |  fdd| D S )	z
    When the passed replacement symbol v is a string, it is converted to a symbol with name v that
    have the same replaced expression integer and nonnegative properties.
    rj   zUnion[sympy.Expr, str]r  )replacedreplacementr@   c                 S  s6   t | tjsJ t |tr.tj|| j| jdS |S d S )Nr  )rm   rn   r   r>   r  r   Zis_nonnegative)r  r  rA   rA   rH   	to_symbol  s    
zsympy_subs.<locals>.to_symbolc                   s   i | ]\}}| ||qS rA   rA   rF   krl   r  rA   rH   r     rJ   zsympy_subs.<locals>.<dictcomp>)rn   r   Zxreplacer  )r  r  rA   r  rH   
sympy_subs  s    
r  z,TypeGuard[Union[torch.SymInt, torch.Tensor]])r   r@   c                 C  s:   t | tjp8t | tjo8tdd t|  |  D S )Nc                 s  s   | ]}t |V  qd S rx   is_symbolicrE   rA   rA   rH   r     rJ   zis_symbolic.<locals>.<genexpr>)	rm   rC   r$   r;  r  r  r  r\  stride)r   rA   rA   rH   r    s     r  rs   r@   c                  G  s   t dd | D S )Nc                 s  s   | ]}t |V  qd S rx   r  r  rA   rA   rH   r     rJ   z"any_is_symbolic.<locals>.<genexpr>r  )rs   rA   rA   rH   any_is_symbolic  s    r  ztorch.fx.GraphModulezOptional[torch.fx.Node])r@  r@   c                 C  s   ddl m} tg d}t r*|d | jjD ]r}t|j	|v rL|  S tj
jjst|j	tjjrtjjj|j	jv r|  S |jd }d ur2||r2|  S q2d S )Nr   )free_unbacked_symbols)z,aten._fused_moving_avg_obs_fq_helper.defaultz7aten._fused_moving_avg_obs_fq_helper_functional.defaultzfbgemm.dense_to_jagged.defaultz%fbgemm.jagged_to_padded_dense.defaultZrun_and_save_rng_stateZrun_with_rng_statezaten._local_scalar_densezaten._assert_scalar)zaten._unsafe_index_put.defaultz0aten._unsafe_masked_index_put_accumulate.defaultzaten.index_put.defaultzaten.index_put_.defaultzaten.scatter.srczaten.scatter.reducezaten.scatter.value_reducezaten.scatter_add_zaten.scatter_add.defaultzaten.scatter_reduce.twozaten.scatter_reduce_.twozaten.scatter_reduce.two_outr  )%torch.fx.experimental.symbolic_shapesr  r   rC   $are_deterministic_algorithms_enabledr  r  r  r>   r+  	_inductorr^   Zgraph_partitionrm   r,  r-  r   r0  cudagraph_unsafer1  r   get)r@  r  Zforbidden_setr?  r  rA   rA   rH   %get_first_incompatible_cudagraph_node  s*    
r  c                 C  s&   t tt| jj}|jdks"J |S )z$Get the output node from an FX graphr>  )nextiterreversedr  r  r  )r@  Z	last_noderA   rA   rH   output_node  s    r  zOrderedSet[torch.device]c                 C  s\   | j jdd}tdd |D }t| jd }t|tr<|n|f}tdd |D }||B S )Nr7  r!  c                 s  s.   | ]&}t |jd tjr|jd  jV  qdS r  N)rm   r   r  rC   r;  r   r  rA   rA   rH   r     s   z"get_all_devices.<locals>.<genexpr>r   c                 s  s<   | ]4}t |tjjrt |jd tjr|jd  jV  qdS r  )rm   rC   r<  r(   r   r  r;  r   r  rA   rA   rH   r     s   )r  Z
find_nodesr   r  rs   rm   tuple)r@  Zplaceholder_nodesZinput_devicesZout_argZout_argsZout_devicesrA   rA   rH   get_all_devices  s    r  c                  C  s   t tj D ]} | dsqtj|  }|j D ]V}|dr2t||}t|tj	j
jjr2|jD ]$}t|tj	j
jjrb|jjj  qbq2tj| = qdtjv rtjd }t|jjj`|jj`t  d S )Nz&torch._inductor.runtime.compile_tasks.Ztriton_ztriton.runtime.driver)r  sysmodulesr  
startswith__dict__rB   rm   rC   r  runtimeZtriton_heuristicsZCachingAutotunerZcompile_resultsZTritonCompileResultkernelrunmod__del__r   driveractiveutilsinstancegcZcollect)module_namem	attr_namer  rI  r  rA   rA   rH   unload_xpu_triton_pyds  s,    








r   _registered_cachesrS  r@   c                 C  s0   t | drt| js"t|  dt|  | S )zh
    Use this decorator to register any caches that should be cache_clear'd
    with fresh_cache().
    cache_clearz# does not have a cache_clear method)r   callabler  AttributeErrorr  r6  rS  rA   rA   rH   clear_on_fresh_cache  s    
r  c                  C  s   t D ]} |   qdS )z&
    Clear all registered caches.
    N)r  r  r  rA   rA   rH   clear_caches*  s    r  zOptional[dict[str, Any]]Optional[str]Iterator[None])cache_entriesdirdeleter@   c              	   #  s`  t   tj|d z>ztjtjd i t	d  tj
 dtjtjdib dV  t| trt| dksJ dtj
rt}| fd	d
|D  W d   n1 s0    Y  W d   n1 s0    Y  |r$t rtj rt  tj  fddd W n" tyH   td   Y n0 W t   nt   0 dS )z
    Contextmanager that provides a clean tmp cachedir for pt2 caches.

    Optionally, pass a dict as 'cache_entries' to get a list of filenames and sizes
    generated with this cache instance.
    )r  ZTORCHINDUCTOR_CACHE_DIRzUsing inductor cache dir %stritonZTRITON_CACHE_DIRNr   z!expected empty cache_entries dictc              	     s,   i | ]$}d |vr|t jt j |qS )z.lock)ospathgetsizer  )rF   f)triton_cache_dirrA   rH   r   N  s   zfresh_cache.<locals>.<dictcomp>c                   s   t jd |dS )Nz*Failed to remove temporary cache dir at %s)exc_info)r   warning)r  r  r  )inductor_cache_dirrA   rH   r$  ]  s   zfresh_cache.<locals>.<lambda>)onerrorz(on error, temporary cache dir kept at %s)r  tempfilemkdtempr   patchdictr  environr   r   r  r  rm   rL   existslistdirr  
is_windowsrC   r<   rD   r   shutilrmtree	Exceptionr  )r  r  r  filesrA   )r  r  rH   fresh_cache2  s<    



@


r$  z	list[int])seqr@   c                 C  s(   | j }tt| }ttt||ddS )NT)r   reverse)__getitem__r   rL   r  r  rd  )r%  getterZa_rrA   rA   rH   argsortp  s    r)  r'   z.Sequence[Union[int, torch.SymInt, sympy.Expr]])r  r%  r@   c                   sL   dddd fdd}dd t |D }t|t|d}d	d |D }|S )
Nztuple[int, sympy.Expr]rf   rZ  c                   s`   | \}}|\}}ddd fdd}|||k r4dS |||krDdS ||k rPdS ||kr\dS dS )	Nz%Union[bool, torch.SymInt, sympy.Expr]rk   r  c                   s   t | tr| S  j| ddS )NT)Zsize_oblivious)rm   rk   Zevaluate_exprr  r  rA   rH   evaluate~  s    
z*argsort_sym.<locals>.cmp.<locals>.evaluater   r)   r   rA   )r   r   Za_idxZa_valZb_idxZb_valr+  r*  rA   rH   r[  z  s    zargsort_sym.<locals>.cmpc                 S  s,   g | ]$\}}|t |tjr"|jjn|fqS rA   )rm   rC   r$   r?  r  )rF   r  r   rA   rA   rH   rI     s   zargsort_sym.<locals>.<listcomp>rc  c                 S  s   g | ]\}}|qS rA   rA   )rF   r  r   rA   rA   rH   rI     rJ   )r   rd  r   
cmp_to_key)r  r%  r[  exprsrI  rA   r*  rH   argsort_symw  s    r.  ztorch.dtype)r   r@   c                 C  s    | t jkrdS t jd| d S )Nre   rA   r   )rC   r	  r   Zelement_sizer/  rA   rA   rH   get_dtype_size  s    
r0  c                   @  s   e Zd ZU ded< dS )LineContextr
   contextNr|   r}   r~   r   rA   rA   rA   rH   r1    s   
r1  c                   @  s   e Zd ZU ded< ded< dS )ValueWithLineMapr>   rw   zlist[tuple[int, LineContext]]Zline_mapNr3  rA   rA   rA   rH   r4    s   
r4  c                   @  s2  e Zd ZdZd?dddddZejddd	d
dZddddZddddZ	ddddZ
ddddZddddZddddZddddZdddd d!Zd"dd#d$d%Zd@dd'd(d)d*ZdAddd(d+d,ZdBddd(d-d.ZdCd0ddd1d2d3Zd4d d5d6d7Zddd8d9Zd:d d;d<d=Zd>S )DIndentedBuffer   r   rf   rB  )initial_indentr@   c                 C  s   g | _ || _d S rx   )_lines_indent)ro  r7  rA   rA   rH   __init__  s    zIndentedBuffer.__init__r
  )tabwidthr@   c                 c  s*   | j }z|| _ d V  W || _ n|| _ 0 d S rx   )r;  )ro  r;  prevrA   rA   rH   set_tabwidth  s
    zIndentedBuffer.set_tabwidthr4  r?   c                 C  s   t  }d}g }| jD ]v}t|tr4| }|d u rVqn"t|trR|||jf qn|}t|tsdJ || |d |d|	d 7 }qt
| |S )Nr)   r  )r	   r8  rm   DeferredLineBaser1  r6  r2  r>   writecountr4  getvalue)ro  bufr   ZlinemaplilinerA   rA   rH   getvaluewithlinemap  s"    




z"IndentedBuffer.getvaluewithlinemapr>   c                 C  s
   |   jS rx   )rE  rw   rv  rA   rA   rH   rA    s    zIndentedBuffer.getvaluec                 C  s   t  }| jD ]r}t|tr,| }|d u r>qnt|tr:qn|}t|tsLJ |drj||d d  q|| |d q| S )N\r   r  )	r	   r8  rm   r>  r1  r>   endswithr?  rA  )ro  rB  rC  rD  rA   rA   rH   getrawvalue  s    




zIndentedBuffer.getrawvaluec                 C  s   | j   d S rx   )r8  clearrv  rA   rA   rH   rI    s    zIndentedBuffer.clearrk   c                 C  s
   t | jS rx   )rk   r8  rv  rA   rA   rH   __bool__  s    zIndentedBuffer.__bool__c                 C  s   d| j | j  S )Nr  )r9  r;  rv  rA   rA   rH   r    s    zIndentedBuffer.prefixc                 C  s   |  d d S )Nr  	writelinerv  rA   rA   rH   newline  s    zIndentedBuffer.newlinez)Union[LineContext, DeferredLineBase, str]rD  r@   c                 C  sl   t |tr| j| nPt |tr:| j||   n.| r\| j|   |  n| jd d S Nr  )rm   r1  r8  r6  r>  with_prefixr  stripro  rD  rA   rA   rH   rL    s    

zIndentedBuffer.writelinez3Sequence[Union[LineContext, DeferredLineBase, str]])linesr@   c                 C  s   |D ]}|  | qd S rx   rK  )ro  rS  rD  rA   rA   rH   
writelines  s    zIndentedBuffer.writelinesr)   'contextlib.AbstractContextManager[None])offsetr@   c                   s    t jdd fdd}| S )Nr
  r?   c                	   3  s<    j  7  _ zd V  W  j  8  _ n j  8  _ 0 d S rx   r9  rA   rV  ro  rA   rH   rz    s    z"IndentedBuffer.indent.<locals>.ctx)
contextlibcontextmanager)ro  rV  rz  rA   rX  rH   indent  s    zIndentedBuffer.indentc                 C  s   |  j |7  _ d S rx   rW  ro  rV  rA   rA   rH   	do_indent  s    zIndentedBuffer.do_indentc                 C  s   |  j |8  _ d S rx   rW  r\  rA   rA   rH   do_unindent  s    zIndentedBuffer.do_unindentFzUnion[IndentedBuffer, str])
other_coderQ  r@   c                 C  s   t |trtd}|jD ],}t |ts|rt|t|t|  }qt	|rTd}|jD ]4}t |trv| j
| qZt| |t|d   qZn@t|}|r| }|sd S | }|dD ]}| | qd S )Ninfr   r  )rm   r5  r   r8  r1  minrL   rx  mathisinfr6  rL  rf   textwrapdedentrstripr  )ro  r_  rQ  re  rD  r   rA   rA   rH   splice  s&    





zIndentedBuffer.splicezCallable[[Any], Any])r  r@   c                   s&   t | jd} fdd| jD |_|S )Nr7  c                   s   g | ]} |qS rA   rA   )rF   rD  r  rA   rH   rI   1  rJ   z&IndentedBuffer.map.<locals>.<listcomp>)r5  r9  r8  )ro  r  r   rA   ri  rH   rq   /  s    zIndentedBuffer.mapc                 C  s   t |  d|   dS )Nr  r  )r   rA  rv  rA   rA   rH   __repr__4  s    zIndentedBuffer.__repr__r   )otherr@   c                 C  s8   | j |j ksJ t| j d}|| j ||j |S )Nrh  )r9  r5  rT  r8  )ro  rk  r   rA   rA   rH   __add__7  s
    zIndentedBuffer.__add__N)r   )r)   )r)   )r)   )F)r|   r}   r~   r;  r:  rY  rZ  r=  rE  rA  rH  rI  rJ  r  rM  rL  rT  r[  r]  r^  rg  rq   rj  rl  rA   rA   rA   rH   r5    s*   
 r5  c                      s2   e Zd Zdd fddZddddd	Z  ZS )
FakeIndentedBufferrB  r?   c                   s   t    d S rx   )superr:  rv  	__class__rA   rH   r:  A  s    zFakeIndentedBuffer.__init__r>   r
   r  c                 C  s(   |dkrt | |S td| dd S )Nrp  zTried to call self.z on FakeIndentedBuffer. This bufferis currently used on TritonTemplateKernel to prevent actualwrites to the body without explicitly specifying the body with`TritonTemplateKernel.set_subgraph_body(name)`)object__getattribute__r   )ro  r   rA   rA   rH   rr  D  s
    
z#FakeIndentedBuffer.__getattribute__)r|   r}   r~   r:  rr  __classcell__rA   rA   ro  rH   rm  @  s   rm  c               	   c  s<   t jt j } }zd V  W | | t _t _n| | t _t _0 d S rx   )r  stdoutstderr)Zinitial_stdoutZinitial_stderrrA   rA   rH   restore_stdout_stderrO  s    rv  c                   @  s   e Zd ZdZddddZdddd	Zdd
dddZdd
dddZd
dddZdd
dddZ	ddddZ
ddddZdS )r>  z.A line that can be 'unwritten' at a later timer>   )rD  c                 C  s   |  sd}|| _d S rO  )rQ  rD  rR  rA   rA   rH   r:  [  s    zDeferredLineBase.__init__zUnion[str, None]r?   c                 C  s   t dS )zJReturns either self.line or None to indicate the line has been 'unwritten'Nr  rv  rA   rA   rH   rp  `  s    zDeferredLineBase.__call__r   rN  c                 C  s   t dS )z3Returns a new deferred line with the same conditionNrw  rR  rA   rA   rH   	_new_lined  s    zDeferredLineBase._new_liner  c                 C  s   |  | | j S rx   rx  rD  )ro  r  rA   rA   rH   rP  h  s    zDeferredLineBase.with_prefixc                 C  s   |  | j S rx   )rx  rD  rx  rv  rA   rA   rH   rx  k  s    zDeferredLineBase.lstripzUnion[int, slice]r  c                 C  s   |  | j| S rx   ry  )ro  r   rA   rA   rH   r'  n  s    zDeferredLineBase.__getitem__rk   c                 C  s
   t | jS rx   )rk   rD  rv  rA   rA   rH   rJ  q  s    zDeferredLineBase.__bool__rf   c                 C  s
   t | jS rx   )rL   rD  rv  rA   rA   rH   __len__t  s    zDeferredLineBase.__len__N)r|   r}   r~   r   r:  rp  rx  rP  rx  r'  rJ  rz  rA   rA   rA   rH   r>  X  s   r>  c                      sH   e Zd ZdZdddd fddZdddd	Zdd d
ddZ  ZS )DelayReplaceLinez6At end of codegen call `line.replace(key, value_fn())`r>   zCallable[[], str])r   value_fnrD  c                   s   t  | || _|| _d S rx   )rn  r:  r   r|  )ro  r   r|  rD  ro  rA   rH   r:  {  s    zDelayReplaceLine.__init__r?   c                 C  s   | j | j|  S rx   )rD  replacer   r|  rv  rA   rA   rH   rp    s    zDelayReplaceLine.__call__rN  c                 C  s   t | j| j|S rx   )r{  r   r|  rR  rA   rA   rH   rx    s    zDelayReplaceLine._new_line)r|   r}   r~   r   r:  rp  rx  rs  rA   rA   ro  rH   r{  x  s   r{  zUnion[int, torch.device])index_or_devicer@   c                 C  s   t | tjr| }ntt | }t|}tjjrf|jd us@J |jdk sT|jdkrbt	
d dS dS |jdkrtdnd}|j}||k rt	j
d	||d
d dS dS )N	   rL  z6GPU arch does not support max_autotune_gemm mode usageFTr<   rb   D   z,Not enough SMs to use max_autotune_gemm mode)min_sms	avail_sms)extra)rm   rC   r   rN   r   createversionhipmajorr   r  r   multi_processor_count)r~  r   propr  r  rA   rA   rH   
is_big_gpu  s&    

r  c                   C  s   t jdjS )Nr:   )rC   r:   get_device_propertiesr  rA   rA   rA   rH   get_max_num_sms  s    r  c                  C  s    t j } t | dur| nd S )zFHandle experimental carveout if set otherwise return hardware SM countNr   )rC   r   Z_get_sm_carveout_experimentalr  )ZcarveoutrA   rA   rH   get_num_sms  s    
r  zOptional[int]r*   )num_tma_descriptorsr   num_programsr@   c                 C  sH   ddl m}m} |du rt }|d}||  t }||||| dS )zKBuilds and returns a WorkspaceArg for the device side TMA workspace buffer.r)   )r*   WorkspaceZeroModeNF)r@  	zero_moder   Z
outer_name)codegen.commonr*   r  r  Z	from_boolTMA_DESCRIPTOR_SIZEZunique_name)r  r   r  r*   r  r  r\  rA   rA   rH   get_tma_workspace_arg  s    
r  r3   zlist[torch.dtype])layoutallowed_layout_dtypesr@   c                 C  s:   | j |vrtd| j | t| jjo8| j |v o8t| jS )NzDNot using template since dtype %s is not in allowed layout dtypes %s)r   r   r   is_gpur   r   r  )r  r  rA   rA   rH   _use_template_for_gpu  s    
r  )backendr@   c                 C  s"   |   dd tj  dD v S )Nc                 S  s   g | ]}|  qS rA   rQ  rE   rA   rA   rH   rI     s   z)_use_autotune_backend.<locals>.<listcomp>,)upperr^   Zmax_autotune_gemm_backendsr  r  rA   rA   rH   _use_autotune_backend  s    r  c                 C  s"   |   dd tj  dD v S )Nc                 S  s   g | ]}|  qS rA   r  rE   rA   rA   rH   rI     s   z._use_conv_autotune_backend.<locals>.<listcomp>r  )r  r^   Zmax_autotune_conv_backendsr  r  rA   rA   rH   _use_conv_autotune_backend  s    r  F)enable_int32enable_float8)r  r  r  r@   c                C  s   ddl m}m} tjtjtjg}|r8tjtjtjtjg}|rN|tj	tj
g t| jjrdt| |sz| jjdko| j|v otjstjotdo|| j|jS )Nr)   )BackendFeaturehas_backend_featurer   ZTRITON)r  r  r  rC   r   r  r  r  extendr   r   r  r   r   r  r   r^   max_autotunemax_autotune_gemmr  ZTRITON_TEMPLATES)r  r  r  r  r  layout_dtypesrA   rA   rH   use_triton_template  s"    
	r  r2   )matricesr@   c                    sd   ddl m}m} ddlm  ddd fdd	| r@tjr@d
S tjjob| obt	fdd| D S )Nr   )has_triton_stable_tma_apihas_triton_tma_devicer)   r  r2   rk   r^  c                   s   t |  dkrdS |  }|tjtjtjfvr4dS |  }| }|	 sT|sTdS |j
d }|rl|j
d }|tjkr jj|drdS ||j } jj|tS )Nr  Fr)   r       )rL   get_size	get_dtyperC   r   r  r   Z
get_layoutZis_transposedis_contiguousr\  r  r  statically_known_ltitemsizeZstatically_known_multiple_ofTMA_ALIGNMENT)rG   r   r  Z
transposedZ	inner_dimZinner_bytesr  rA   rH   _is_tma_compatible  s$    


z3use_triton_tma_template.<locals>._is_tma_compatibleFc                 3  s   | ]} |V  qd S rx   rA   )rF   r  )r  rA   rH   r     rJ   z*use_triton_tma_template.<locals>.<genexpr>)
Ztorch.utils._tritonr  r  r  r  r^   cpp_wrapperr  Zenable_persistent_tma_matmulrp   )r  r  r  rA   )r  r  rH   use_triton_tma_template  s    r  )r  r  r  r  r@   c           	      C  s   ddl m} |jjj|| | dd}|dks:|tjjk r>dS ddlm	} t
jjrVdS t
jt
jt
jg}t| |otjs|tjotd}|r| std	 dS |S )
Nr)   r  r   fallbackr   F)try_import_cutlassZCUTLASSzFailed to import CUTLASS lib. Please check whether _inductor.config.cuda.cutlass_dir is set correctly. Skipping CUTLASS backend for now.)r  r  r  r  	size_hintr^   r:   Zcutlass_backend_min_gemm_sizeZcodegen.cuda.cutlass_utilsr  rC   r  r  r   r  r  r  r  r  r  r   r  )	r  r  r  r  r  Z	gemm_sizer  r  r   rA   rA   rH   use_cutlass_template!  s(    

r  )op_namer@   c                 C  s4   t jj }|dkrdS |  dd |dD v S )z8Check if CUTLASS should be used for the given operation.ALLTc                 S  s   g | ]}|  qS rA   r  rE   rA   rA   rH   rI   F  rJ   z'_use_cutlass_for_op.<locals>.<listcomp>r  )r^   r:   Zcutlass_enabled_opsr  r  )r  Zenabled_opsrA   rA   rH   _use_cutlass_for_opA  s    r  r  r   )rb   r  rd   rc      r   _IntLike)r  r  r  r@   c              
   C  sV   ddl m} |jjtt|t|  t|t| oT|jj	 oT|jj
 oTtj S )Nr   r  )torch._inductor.virtualizedr  r  r  statically_known_truern   AndZGedecompose_k_thresholdZaot_moder  r^   Zdisable_decompose_k)r  r  r  r  rA   rA   rH   use_decompose_k_choiceT  s    r  c           
        s*  t |tjr|jstS t | tjr(| jr:t |tjr@|js@d nt||  ||  dt|} fdd|D }g g g   }}}|D ]\}|| }|dk rq||d @ dkr|dkr|| q|d dkr|| q|| qtj	d	kr|| | S t
|tkr|S || | }	|	d t S d S )
Nr  r  c                   s    g | ]}| kr|kr|qS rA   rA   )rF   ZdivisorZmax_k_splitZmin_k_splitrA   rH   rI   u  s   z get_k_splits.<locals>.<listcomp>rc   r)   r   r  Z
EXHAUSTIVE)rm   rn   r   	is_numberdefault_k_splitsra  divisorsr6  r^   Zmax_autotune_gemm_search_spacerL   k_splits_limit)
r  r  r  r  Zpow_of_2_divisorsZmul_of_32_divisorsZrest_of_splitsdZkPartZbest_splitsrA   r  rH   get_k_splitsd  s<    


r  c                 C  s   t j| jS rx   )rC   r:   r  ZgcnArchNamer   rA   rA   rH   _rocm_native_device_arch_name  s    r  zQtuple[Optional[str], Callable[[], list[Any]], Callable[[], list[Any]], type[Any]]c                  C  s   z6dd l } ddlm}m} ddlm} tj| j	}W n@ t
yv   dddd}dddd	}G d
d d}d }Y n0 ||||fS )Nr   )gen_ops_librarygen_ops_preselected)CKGemmOperationr2  r?   c                   S  s   g S rx   rA   rA   rA   rA   rH   r    s    z*try_import_ck_lib.<locals>.gen_ops_libraryc                   S  s   g S rx   rA   rA   rA   rA   rH   r    s    z.try_import_ck_lib.<locals>.gen_ops_preselectedc                   @  s   e Zd ZdS )z*try_import_ck_lib.<locals>.CKGemmOperationN)r|   r}   r~   rA   rA   rA   rH   r    s   r  )ck4inductorZ(ck4inductor.universal_gemm.gen_instancesr  r  Zck4inductor.universal_gemm.opr  r  r  dirname__file__r   )r  r  r  r  Zpackage_dirnamerA   rA   rH   try_import_ck_lib  s    
r  )r  r@   c                   s   t jst jsdS tjjsdS | jjdks,dS t| j}dd t j	j
D pX|dd |i  fdd  t j	j@ D }|sdS | jtjtjtjfvrdS t \}}}}|std	 dS t  r|t j	_t j	jstd
 dS |t j	jkrtd dS dS )NFr:   c                 S  s   i | ]}| d d |qS ):r   )r  rF   r  rA   rA   rH   r     rJ   z#use_ck_template.<locals>.<dictcomp>r  r   c                   s   g | ]} | qS rA   rA   r  Zrequested_archsrA   rH   rI     s   z#use_ck_template.<locals>.<listcomp>z,Please pip install Composable Kernel packagez,Please set TORCHINDUCTOR_CK_DIR env variablezInvalid path to CK libraryT)r^   r  r  rC   r  r  r   r   r  Zrocmarchr  r  Zck_supported_archr   r   r  r  r  r   r  	is_fbcodeZck_dir)r  Znative_archZrequested_supported_archsZck_package_dirnamer   rA   r  rH   use_ck_template  s<    




r  c                 C  s:   ddl m} tdo8t| o8|jjj|| | dddkS )Nr)   r  CKr   r  r   r  r  r  r  r  r  r  r  r  r  r  r  rA   rA   rH   use_ck_gemm_template  s    r  c                 C  s:   ddl m} tdo8t| o8|jjj|| | dddkS )Nr)   r  ZCKTILEr   r  r   r  r  rA   rA   rH   use_ck_tile_gemm_template  s    r  c                 C  s   t dot| S )Nr  )r  r  r  rA   rA   rH   use_ck_conv_template   s    r  c                 C  s   t jst jo| jjdkS rC  )r^   r  r  r   r   r  rA   rA   rH   _use_template_for_cpu  s    

r  zUnion[ReinterpretView, Buffer])r  mat1mat2r@   c                 C  s6   ddl m} t|j|sJ t| ||ddo4|j S )Nr)   )r3   F)require_constant_mat2)r  r3   rm   r  use_cpp_gemm_templater  )r  r  r  r3   rA   rA   rH   use_cpp_bmm_template
  s
    r  )r  r  r  mat2_transposedr  is_woq_int4q_group_sizer@   c                 C  sJ  ddl m} ddlm} ddlm}	 ddlm}
 t| r@t	dsDdS t
jjsPdS | tjtjfv }tjtjtjtjg}|
|||r| jnd ||d\}}}} }}t||frdS t||jr| }|	| \}}|d	|||| | |t | |d

}ddddd}| j|v oH|d uoH||oHt||joH| pH| S )Nr)   r~  )create_micro_gemm)*get_gemm_template_output_and_compute_dtype)mm_argsCPPF)	out_dtyper  Zuse_4x2_dim
micro_gemm)Zinput_dtypeZinput2_dtypeoutput_dtypeZnum_threadsZuse_refr  r2   rk   r^  c                 S  s   |    |  d dkS )Nr   r)   )Zfreeze_layoutZ
get_striderG   rA   rA   rH   is_last_dim_stride1J  s    z2use_cpp_gemm_template.<locals>.is_last_dim_stride1)r  r  Zcodegen.cpp_micro_gemmr  Zcodegen.cpp_utilsr  Zkernel.mm_commonr  r  r  r^   cppZweight_prepackr  rC   r  r  r  r  Zhalfr   has_free_symbolsrm   BaseViewZunwrap_viewparallel_num_threadsr  Zis_module_buffer)r  r  r  r  r  r  r  r  r  r  r  Z	int8_gemmr  r  r  r  r  r   r  r  rA   rA   rH   r    sX    		
r  c                   C  s   t jp
t j ptdS )NZATEN)r^   r  r  r  rA   rA   rA   rH   use_aten_gemm_kernelsW  s    
r  c                   @  sL   e Zd ZU edZded< ddddZdddd	Zd
ddddZ	dS )DebugDirManagerr   r>   prev_debug_namerB  r?   c                 C  s   t tj| _d S rx   )r  r  counterr   rv  rA   rA   rH   r:  a  s    zDebugDirManager.__init__c                 C  s0   t jjj| _| j d| j | _| jt jj_d S )NZ_tmp_)rC   _dynamor^   debug_dir_rootr  r   new_namerv  rA   rA   rH   	__enter__d  s    zDebugDirManager.__enter__r
   r  c                 G  s   t | j | jtjj_d S rx   )r   r!  r  r  rC   r  r^   r  )ro  rs   rA   rA   rH   __exit__i  s    zDebugDirManager.__exit__N)
r|   r}   r~   r  r@  r  r   r:  r  r   rA   rA   rA   rH   r  ]  s
   

r  zCallable[P, _T]rk  rl  ztuple[_T, list[str]])r   rs   r3  r@   c                   st   ddl m} g  ddd fdd}tj|d|( tj  | |i |}W d    n1 sb0    Y  | fS )	Nr)   r-   r>   rB  coder@   c                   s     |  d S rx   r6  r  source_codesrA   rH   save_output_codew  s    z*run_and_get_code.<locals>.save_output_coder  r  r.   r   r  rq  rC   r  reset)r   rs   r3  r.   r  rI  rA   r  rH   run_and_get_coden  s    
,r
  c                 O  sF   t | g|R i |\}}g }|D ]}|td|tj q"||fS )Nz	'''.*?''')r
  r  refindallDOTALL)r   rs   r3  rI  r  Zkernelsr  rA   rA   rH   run_and_get_kernels  s
    r  ztuple[Any, list[str]]c                   s   dd fdd}t |S )Nr
   r?   c                    s     } |     | S rx   )r   ZbackwardrU  r   rA   rH   run_with_backward  s    z1run_fw_bw_and_get_code.<locals>.run_with_backward)r
  )r   r  rA   r  rH   run_fw_bw_and_get_code  s    r  c              	     s   ddl m} g dddfdd dd	d
 fdd}tj|d|X tj|d ( tj  | |i |}W d   n1 s0    Y  W d   n1 s0    Y  S )zLGet the inductor-generated code, but skip any actual compilation or running.r)   r-   r>   rB  r  c                   s     |  d S rx   r  r  r  rA   rH   r    s    z"get_code.<locals>.save_output_coder.   r
   rt  c                   sF   G dd d}| j r|  n|  \}} |j |r@ |j | S )Nc                   @  s0   e Zd ZdZddddZdddddd	Zd
S )z@get_code.<locals>.patched_compile_to_module.<locals>.DummyModulez4This is empty to replace the generated triton modulerB  r?   c                 S  s   d S rx   rA   rv  rA   rA   rH   r:    s    zIget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.__init__r
   rm  c                 _  s   d S rx   rA   rn  rA   rA   rH   call  s    zEget_code.<locals>.patched_compile_to_module.<locals>.DummyModule.callN)r|   r}   r~   r   r:  r  rA   rA   rA   rH   DummyModule  s   r  )r  Zcodegen_with_cpp_wrapperZcodegenrw   )ro  r  Zwrapper_codekernel_code)r  rA   rH   patched_compile_to_module  s    

z+get_code.<locals>.patched_compile_to_moduleZcompile_to_moduler  Nr  )r   rs   r3  r.   r  r   rA   )r  r  rH   get_code  s    
Jr  c                 O  sJ   t | g|R i |}dt|  kr.dksBn J dt| |d S Nr)   r  z%expected one or two code outputs got r   )r  rL   )r   rs   r3  r  rA   rA   rH   get_triton_code  s
    r  c                 O  sN   t | g|R i |\}}dt|  kr2dksFn J dt| |d S r  )r
  rL   )r   rs   r3  r   r  rA   rA   rH   run_and_get_triton_code  s
    r  ztuple[Any, list[GraphLowering]]c                   s   ddl m  ddlm} |jg dddd fdd}tj|d	| | |i |}W d    n1 sp0    Y  |fS )
Nr   r-   r6   r
   rB  rm  c                    s2   | i | | d }t | s$J | d S )Nr  )rm   r6  )rs   r3  r  r.   Zgraph_loweringsZ	real_initrA   rH   	fake_init  s    z-run_and_get_graph_lowering.<locals>.fake_initr:  )Ztorch._inductor.graphr.   Ztorch._inductor.output_coder7   r:  r   r  rq  )r   rs   r3  r7   r  rI  rA   r  rH   run_and_get_graph_lowering  s    ,r  )aten_opoverride_fnr@   c              	   c  sN   ddl m} |j|  }z&t|||j| < dV  W ||j| < n||j| < 0 dS )z
    Override the lowering of aten_op with override_fn.
    The first argument of override_fn is the original lowering fn.
    r   )loweringN)Ztorch._inductorr  Z	loweringsr   partial)r  r  r  orig_fnrA   rA   rH   override_lowering  s    
r"  zOptional[Callable[..., Any]])pre_fnpost_fnr@   c                   s>   ddl m} |j dddd fdd}tjj|d|S )zr
    Add hook functions to be called at the beginning and end of Scheduler.__init__.
    Used for unit tests.
    r   )	Schedulerr
   )r`  r  r@   c                   s&   | |  | |}r"| | |S rx   rA   )r`  r  outr!  r$  r#  rA   rH   r{    s
    


z(add_scheduler_init_hook.<locals>.wrapperr:  )torch._inductor.schedulerr%  r:  unittestr   r  rq  )r#  r$  r%  r{  rA   r'  rH   add_scheduler_init_hook  s    r*  )msgr@   c                 C  s    t jrt|  n
t|  dS )z
    Warnings that will be actionable for PyTorch developers, but not
    end users.  Allows us to easily disable them in stable releases but
    keep them on for nightly builds.
    N)r^   Zdeveloper_warningsr   r  info)r+  rA   rA   rH   developer_warning  s    r-  c                  C  s   z^t jd} | d tt jk r\tt j| d  dkr\t j| d  d dkr\t j| d  W S W n typ   Y n0 t jD ]"}|drx|tdd   S qxdS )a  
    An experimental API used only when config.benchmark_kernel is true.

    The benchmark name is only available at codegen time. So we can not
    directly call it in benchmark_all_kernels which is run after codegen.

    The function assumes the argument after --only is the benchmark name.
    It works for torchbench.py/hugginface.py/timm_models.py. But for ad-hoc
    scripts, this function may return None.

    There are 2 flavors of --only argument we need handle:
    1. --only model_name
    2. --only=model_name
    z--onlyr)   r   r  z--only=N)r  argvr   rL   
ValueErrorr  )r  r5  rA   rA   rH   get_benchmark_name  s    

r0  )r  r@   c                 C  s   t dd | D S )Nc                 s  s   | ]}|d kV  qdS r)   NrA   rE   rA   rA   rH   r   =  rJ   zis_ones.<locals>.<genexpr>rp   r  rA   rA   rH   is_ones<  s    r4  c                 C  s   t dd | D S )Nc                 s  s   | ]}|d kV  qdS )r   NrA   rE   rA   rA   rH   r   A  rJ   zis_zeros.<locals>.<genexpr>r2  r3  rA   rA   rH   is_zeros@  s    r5  zSequence[torch.Tensor])inputsr@   c                 C  s   t dd | D S )Nc                 s  s*   | ]"}t |tjr|jtd kV  qdS )r   N)rm   rC   r;  r   )rF   r   rA   rA   rH   r   E  s   z is_cpu_device.<locals>.<genexpr>r2  )r6  rA   rA   rH   is_cpu_deviceD  s    r7  )r  r@   c                 C  s*   t | tjsJ d| jr tjS tjS d S )Nz8only support sympy.Expr as input to get_sympy_Expr_dtype)rm   rn   r   r   rC   r  r  )r  rA   rA   rH   get_sympy_Expr_dtypeL  s    r8  zIterator[Any])should_profilers   r3  r@   c                 o  sH   | r>t jj|i |}|V  W d    qD1 s20    Y  nd V  d S rx   )rC   r   r   )r9  rs   r3  r   rA   rA   rH   maybe_profileV  s    &r:  c                  C  s   t jj} | dk rt } | S Nr)   )r^   r  threadsrC   Zget_num_threads)r<  rA   rA   rH   r  _  s    r  c                  C  s*   ddl m}  |  }|dtjjr$dndS )Nr)   )get_backend_optionsZ
num_stagesr     )Zruntime.triton_helpersr=  r  rC   r  r  )r=  optionsrA   rA   rH   get_backend_num_stagesf  s    r@  c                 C  s   ddl m}m} | tjtjtjfv s(J t|j	
drddlm} | }| tjtjfv rf|| |S tjjjjr~|tj|S |tj|S n8| tjtjfv r|| S tjjjjr|tjS |tjS d S )Nr   )get_max_simd_tflopsget_max_tensorcore_tflopsZ
clock_rate)max_clock_rate)triton.testingrA  rB  rC   r   r  r  inspect	signature
parametersr  Ztorch._utils_internalrC  backendsr:   matmulZ
allow_tf32)r   rA  rB  rC  Zsm_clockrA   rA   rH   get_device_tflopsn  s    

rJ  c                  C  s   ddl m}  |  S )Nr   get_dram_gbps)rD  rL  rK  rA   rA   rH   get_gpu_dram_gbps  s    rM  c                  C  s"   ddl m}  | jjdddS )Nr   r  Zmax_shared_mem)Ztriton.runtimer  r  r  r  r  rN  rA   rA   rH   get_gpu_shared_memory  s    rO  )reduction_typer@   c                 C  s
   |  dS )NZwelford)r  rP  rA   rA   rH   is_welford_reduction  s    rR  c                 C  s    t | rdS | dkrdS dS d S )Nr>  Zonline_softmax_reducer  r)   )rR  rQ  rA   rA   rH   reduction_num_outputs  s
    rS  c                   C  s   t  dkS )NLinux)platformsystemrA   rA   rA   rH   is_linux  s    rW  c                   C  s
   t jdkS )Nr`   )r  rU  rA   rA   rA   rH   r    s    r  zIterable[Any])itrr@   c                 C  s   t dd | D S )Nc                 s  s"   | ]}t |tjo|j V  qd S rx   )rm   rn   r   r  rE   rA   rA   rH   r     rJ   z#has_free_symbols.<locals>.<genexpr>r  )rX  rA   rA   rH   r    s    r  c                  G  s   ddl m} | D ]l}t||j|j|j|j|jfrZt|	 p>dsRt|
 pNdr| dS qt||jsjqqtdt| qdS )Nr)   r~  rA   Tzunexpected type for is_dynamic F)r  r  rm   r  r  r  ZComputedBufferr/   r  Zmaybe_get_sizeZmaybe_get_strider2   	TypeErrorr   )rs   r  trA   rA   rH   
is_dynamic  s    
r[  c                   @  s   e Zd ZdZdZdS )PlaceholderKERNEL_NAMEDESCRIPTIVE_NAMEN)r|   r}   r~   r]  r^  rA   rA   rA   rH   r\    s   r\  r&   )r  r@  inpr+  r@   c              	   C  s6  ddl m} tjdddd}t }t }t|t|dj|  t	d|j
 |d	 t	|j
|d	 t }t|| | |j
 W d    n1 s0    Y  t | }	||j
 |j
  |  t	d
|j
 |d	 t	|j
|d	 | | k}
td||j|
|	 W d    n1 s(0    Y  d S )Nr)   )stable_topological_sortwzutf-8F)modeencodingr  )r@  	fake_modezBefore:
)filezAfter:
zZ%s, save before/after graph to %s, graph before/after are the same = %s, time elapsed = %s)Zpattern_matcherr`  r  NamedTemporaryFileior	   rT   rP   	propagaterQ  r  r   nowrS   ZlintZ	recompilerA  r   r,  r   )r  r@  r_  r+  r`  r  Z	before_ioZafter_io
start_timeZtime_elapsedrZ  rA   rA   rH   pass_execution_and_save  s:    (

rk  z"Optional[Union[Buffer, Operation]])	input_bufr@   c                 C  s&   ddl m} t| |jo$t| j|jS )zB
    Check if input buffer is a multi-outputs template buffer
    r)   r~  )r  r  rm   ZCppTemplateBufferr  ZMultiOutputLayoutrl  r  rA   rA   rH   is_multi_outputs_template  s    rn  c                 C  s4   ddl m} t| |jo2t| jdko2t| jd S )zL
    Check if input buffer is a output of multi-outputs template buffer
    r)   r~  r   )r  r  rm   ZMultiOutputrL   r6  rn  rm  rA   rA   rH   #is_output_of_multi_outputs_template  s    ro  z Optional[Union[Node, Operation]]z!Optional[torch._ops.OperatorBase])r?  r  r@   c                 C  s   | d u rdS ddl m} t| |jkr8|d u p| j|u pt| |jkottjj	drf| jtjj	j
jkpttjj	dr| jtjj	jjkpttjj	do| jtjj	jjkS )NFr)   r~  all_to_all_singleall_gather_into_tensorreduce_scatter_tensor)r  r  r   Z_CollectiveKernelop_overloadFallbackKernelr   rC   r   Ztorchrecrp  defaultrq  rr  r?  r  r  rA   rA   rH   is_collective	  s"    

rw  z"Optional[Union[IRNode, Operation]])r?  r@   c                 C  s   ddl m} t| |jkS Nr)   r~  )r  r  r   Z_WaitKernelr?  r  rA   rA   rH   is_wait0	  s    rz  r8   )snoder@   c                 C  s4   ddl m} t| |r*tdd | jD S t| jS )Nr   GroupedSchedulerNodec                 s  s   | ]}t |V  qd S rx   )contains_collectiverE   rA   rA   rH   r   :	  rJ   z&contains_collective.<locals>.<genexpr>)r(  r}  rm   r  snodesrw  r?  r{  r}  rA   rA   rH   r~  6	  s    
r~  c                 C  s8   ddl m} t| |r*tdd | jD S t| jS d S )Nr   r|  c                 s  s   | ]}t |V  qd S rx   )contains_waitrE   rA   rA   rH   r   C	  rJ   z contains_wait.<locals>.<genexpr>)r(  r}  rm   r  r  rz  r?  r  rA   rA   rH   r  ?	  s    
r  zOptional[Operation]z?Union[torch._ops.OpOverload, Collection[torch._ops.OpOverload]]c                 C  s6   ddl m} t|tjjr |g}t| |jo4| j|v S rx  )r  r  rm   rC   r,  r-  rt  rs  rv  rA   rA   rH   is_fallback_opH	  s    r  )buf_namename_to_bufname_to_fused_noder@   c                 C  s   |||  j   S rx   )Zdefining_opra  )r  r  r  rA   rA   rH   buf_name_to_fused_snodeS	  s    r  c                 C  s   dS r#  rA   r{  rA   rA   rH   r$  ^	  rJ   zMutableSet[BaseSchedulerNode]zdict[str, SchedulerBuffer]zdict[str, BaseSchedulerNode]zCallable[[Any], bool])r{  collected_node_setr  r  criteria_cbr@   c                 C  sP   || rd S | |  | jD ].}t|j||}||v r8qt|||||d qd S )Nr  )r  Zunmet_dependenciesr  r   find_recursive_deps_of_node)r{  r  r  r  r  depZdefining_op_for_deprA   rA   rH   r  Y	  s     

r  c                 C  s   dS r#  rA   r  rA   rA   rH   r$  w	  rJ   c              	   C  s   || rd S | |  |  D ]h}|jD ]\}|jd us:J |j dkrJq(|j |vrZq(||j  }||v rrq(t|||||d q(qd S )NZOUTPUTr  )r  get_outputsr/  r?  ra  find_recursive_users_of_node)r{  r  r  r  r  or  Zuser_oprA   rA   rH   r  r	  s(    

r  )dynamo_gm_num_inputsaot_fw_gm_num_inputsr@   c                 C  s   t jjjrdnd}||  | S )zaComputes the number of inputs to the aot fw graph which have fixed addresses (params and buffers)r  r   )rC   Z
_functorchr^   Zfunctionalize_rng_ops)r  r  Znum_rng_seed_offset_inputsrA   rA   rH   num_fw_fixed_arguments	  s    r  )fx_gr@   c                 C  sj   ddddd}d}g }| j jD ](}|jdkr ||r@|| |d7 }q |ttt|ksbJ t|S )	z>
    Infers which inputs are static for a backwards graph
    r(   rk   r^  c                 S  s(   d| j vo&d| j vo&d| j vo&d| j vS )NZtangentsZbwd_seedZbwd_base_offsetZbwd_rng_stater  r  rA   rA   rH   is_saved_tensor	  s    
z'count_tangents.<locals>.is_saved_tensorr   r7  r)   )r  r  r  r6  r  r   rL   )r  r  	arg_countZstatic_arg_idxsr  rA   rA   rH   count_tangents	  s    


r  c                   @  s8   e Zd ZU ded< ddddZedddd	d
ZdS )	BoxedBoolrk   rw   r?   c                 C  s   | j S rx   )rw   rv  rA   rA   rH   rJ  	  s    zBoxedBool.__bool__r
   zUnion[BoxedBool, bool]r  c                 C  s   t | trd| _| S dS r#  )rm   r  rw   r  rA   rA   rH   disable	  s    
zBoxedBool.disableN)r|   r}   r~   r   rJ  rq  r  rA   rA   rA   rH   r  	  s   
r  )kernel_listr@   c              	   #  sn   ddl m} |jddddddddd	 fd
d}tj|d| d V  W d    n1 s`0    Y  d S )Nr)   r+   Tr,   r>   r	  rk   r
   )ro  kernel_namer  r  gpucpp_definitionr@   c                   s     | | |||||S rx   r  )ro  r  r  r  r  r  r  Zorig_define_kernelrA   rH   define_kernel	  s    
z.collect_defined_kernels.<locals>.define_kernelr  )NTN)codegen.wrapperr,   r  r   r  rq  )r  r,   r  rA   r  rH   collect_defined_kernels	  s       "r  c                 C  s   | d S )NZ__original__rA   r  rA   rA   rH    get_cloned_parameter_buffer_name	  s    r  c                 C  s   | t v S rx   )rK   r  rA   rA   rH   r  	  s    r  c                 C  s   | dkot | S )Nr;   )r  r  rA   rA   rH   device_need_guard	  s    r  c                 C  sP   t  r4| tjkr4tj r4tj dkr4t jr4dS | ttj	tj
tjgv S d S )N)r  r   F)r^   r  rC   r  r:   rD   Zget_device_capabilityZbfloat16_atomic_adds_enabledr   r  rk   r/  rA   rA   rH   ,needs_fallback_due_to_atomic_add_limitations	  s    r  )rs  rP  
self_dtype	src_dtypesrc_device_typesrc_is_tensorr@   c                 C  s   | j tjjjtjjjfv r&|d u r&dS | j tjjjkr:dnd}|d |fvp|r^t|r^t|p| j tjjjkr|dkr|r|dkrt	j
jrt	j
jpt dkp||kr|tjtjfv pt S )NFr  r   r   r)   )ZoverloadpacketrC   r   ZatenZscatter_reduce_Zscatter_reduceZscatter_r  r  r^   r  Zfallback_scatter_reduce_sumZdynamic_threadsr  rk   r  r  )rs  rP  r  r  r  r  Z	reduce_tyrA   rA   rH   use_scatter_fallback	  s<    	r  c                 C  s  ddl m}m} ddlm} tdt|  d t| D ]\}}td|dd ||u rdtd	 q8||u rvtd
 q8t||r|	 }t|rdnd d |r|j
dusJ td|j
jj  td |jjD ]}t| qtd |jjD ]}t| qq8tdt| q8dS )z
    An API that can be used in pdb to dump a node_schedule.
    Right mainly dump the read/write dependencies but can add more as needed.
    r   DisableReductionEnableReduction)SchedulerNodezNode schedule with z nodesr  3r  zenable reductionzdisable reductionZredpwz scheduler nodeNzoriginal reduction hint zReadDep:z	WriteDep:zUnrecognized node type: )Ztorch._inductor.codegen.simdr  r  r(  r  rQ  rL   r   rm   Zis_reductionr?  r  Zreduction_hintZread_writesZreadsZwritesr   r   )r}  r  r  r  r  r?  Zis_redr  rA   rA   rH   dump_node_schedule
  s,    


r  r4  )r   r@   c                 C  s*   ddl m} ||  t| j t dkS )Nr   )r  )r  r  storage_offsetr0  r   GPU_ALIGN_BYTES)r   r  rA   rA   rH   tensor_is_aligned;
  s    r  )example_inputr@   c                 C  s   t | jjsdS tjpt| S r#  )r  r   r   r^   Zassume_aligned_inputsr  )r  rA   rA   rH   should_assume_input_alignedI
  s    r  rU  c                  C  s4   t jj } | st S | jj}|s,t S | S rx   )	rC   _guardsTracingContexttry_getrY  nullcontextrd  r  Zsuppress_guards)tracing_contextr  rA   rA   rH   #maybe_get_suppress_shape_guards_ctxR
  s    r  ztuple[_T, str]c                 O  s   t jjtdd tj  dd l}dd l	}|
 }||}ddlm} || |j}||j | |i |}	| }
|| || W d    n1 s0    Y  |	|
fS )Nr   Tr   )output_code_log)r)  r   r  rq  r^   rC   r  r	  rg  loggingr	   StreamHandlerZtorch._inductor.codecacher  
addHandlerlevelsetLevelDEBUGrA  removeHandler)r   rs   r3  rg  r  Zlog_capture_stringchr  Z
prev_levelrI  r   rA   rA   rH   run_and_get_cpp_codec
  s    



(r  zSequence[InputType]zOptional[ShapeEnv]c                 C  s<   t | }|d ur|jS | D ]}t|tjr|jj  S qd S rx   )rP   r  rm   rC   r$   r?  )r6  rd  inputrA   rA   rH   shape_env_from_inputs|
  s    r  zCallable[[list[InputType]], _T]zOrderedSet[int])rD  inputs_to_checkmutated_input_idxsr@   c                   s,   t  dkrS ddd fdd}|S )Nr   list[InputType]r
   )
new_inputsr@   c                   s0   t |  \}}| }t|r,t|| |S rx   )copy_misaligned_inputsrL   rC   Z_foreach_copy_)r  old_tensorsnew_tensorsr&  r  rD  r  rA   rH   r  
  s    z)align_inputs_from_check_idxs.<locals>.run)rL   )rD  r  r  r  rA   r  rH   align_inputs_from_check_idxs
  s    r  c                 C  s`   d|   v rd}n$tdd t|   |  D d }t| |fd }t||   |  S )Nr   c                 s  s   | ]\}}|d  | V  qdS r1  rA   )rF   shaper  rA   rA   rH   r   
  rJ   z)clone_preserve_strides.<locals>.<genexpr>r)   rv   )r\  r   r   r  rC   Z
as_stridedclone)rG   Zneeded_sizebufferrA   rA   rH   clone_preserve_strides
  s    "r  r  zOptional[OrderedSet[int]]z-tuple[list[torch.Tensor], list[torch.Tensor]])r  check_inputs_idxsreturn_pair_idxsr@   c                 C  s   g }g }|du}|D ]f}| | }t |tjs>J dt| | t rt|| |< |r||v r|| || |  q||fS )z
    Clones misaligned tensors which we inferred were aligned. Returns a tuple of [old_tensors], [new_tensors] for every
    cloned tensor which is in `return_pair_idxs`.
    Nz Expected tensors only, but got: )rm   rC   r;  r   data_ptr	ALIGNMENTr  r6  )r  r  r  r  r  Zret_pair_definedr   Z_inprA   rA   rH   r  
  s    

r  )r6  static_input_idxsr@   c                 C  sT   g }|D ]2}| | }t |tjr| t dkr|| qt|t|krP|S |S )z[
    We require all inputs to be aligned, so introduce a copy for any
    that aren't.
    r   )rm   rC   r;  r  r  r6  rL   )r6  r  Zaligned_static_input_idxsr  r  rA   rA   rH   remove_unaligned_input_idxs
  s    r  r   r@   c                 C  sZ   ddl m} ttjj}|jjj}|jjj	j
}|jj| |krFdS || oX|| |kS )Nr)   r  T)r  r  rC   Ziinfor  r   r  r  r  r  has_hintr  )r   r  Zint_maxr  r  rA   rA   rH   expr_fits_within_32bit
  s    
r  r7   )rE  compiled_graphr@   c                   s   t jj }|d ur|jd urt|jdks0J t| |jd usFJ |jD ]h}|d u rf|jd  qLd t jj  }r|j ddd fdd|jt	fdd	|D  qLd S )
Nr   Fr
   z,Union[float, int, SymInt, SymFloat, SymBool]r  c                   s(   d u rt | S  r| S | S rx   )rf   Zdeserialize_symexprZevaluate_symexpr)r   )fakify_first_callr  rA   rH   map_expr  s
    
z4set_tracing_context_output_strides.<locals>.map_exprc                 3  s   | ]} |V  qd S rx   rA   )rF   r   )r  rA   rH   r     rJ   z5set_tracing_context_output_strides.<locals>.<genexpr>)
rC   r  r  r  Zoutput_stridesrL   r  r6  r  r  )rE  r  r2  r-  rz  rA   )r  r  r  rH   "set_tracing_context_output_strides
  s    
r  c                  C  s`   t jd urt jS t  sdS tj r*dS zddlm}  W n tyN   Y dS 0 | tj	dkS )NFr   REMOTE_CACHE_VERSIONz.pytorch/remote_cache:fx_graph_memcache_version)
r^   Zfx_graph_remote_cacher  rC   Z_utils_internalZis_fb_unit_testZtorch._inductor.fb.remote_cacher  ModuleNotFoundErrorZjustknobs_getval_intr  rA   rA   rH    should_use_remote_fx_graph_cache  s    

r  c                 C  s   t dd| S )Nz[^a-zA-Z0-9_]r   )r  subr  rA   rA   rH   normalize_name#  s    r  ztl.int1ztl.float8e4nvztl.float8e5ztl.float8e4b8ztl.float8e5b16ztl.uint8)ztl.boolztl.float8_e4m3fnztl.float8_e5m2ztl.float8_e4m3fnuzztl.float8_e5m2fnuzztl.float8_e8m0fnuztl.float4_e2m1fn_x2c                 C  s   i | ]\}}||qS rA   rA   r  rA   rA   rH   r   3  rJ   r   z^.*[.]c                 C  s   t dt| }t||S )z"Convert torch.dtype to triton typetl.)_triton_type_rer  r>   _triton_type_mappingr  )r   Ztriton_type_namerA   rA   rH   triton_type9  s    r  c                 C  s6   t | | }|dd}tt|}t|tjs2J |S )Nr  r  )_torch_triton_mappingr  r}  rB   rC   rm   r   )r   Zadjusted_type	type_namer  rA   rA   rH   triton_type_to_torch?  s
    
r  )r  rw   r@   c                 C  sh   | j  of|  | kof|  | kof| j|jkof| j|jkof|   |  kof|  | kS rx   )	is_mkldnnr\  r  r   r   Zuntyped_storager  r  r  rw   rA   rA   rH   is_same_tensorG  s    

r  c                 C  sJ   | j oH|  | koH| j|jkoH| j|jkoHtjj| tjj|kS rx   )r  r\  r   r   rC   r   mkldnnr  r  rA   rA   rH   is_same_mkldnn_tensorS  s    

r  ztuple[str, ...]c                   C  s   dS )N)rc  isnanZlogical_notlogical_andZsignbitand_leltgegteqner  xorrA   rA   rA   rA   rH   boolean_ops]  s    r  c                   @  s   e Zd ZU ded< ded< dS )OpDtypeRuler%   type_promotion_kindr   override_return_dtypeNr3  rA   rA   rA   rH   r  q  s   
r  zdict[str, OpDtypeRule]op_dtype_propagation_rulesr%   )r   r  r   r@   c                 C  s   t ||t| < d S rx   )r  r  )r   r  r   rA   rA   rH   #register_op_dtype_propagation_rulesz  s    r  zOrderedSet[str]op_requires_libdevice_fp64c                 C  s   t |  d S rx   )r  r  r  rA   rA   rH   #register_op_requires_libdevice_fp64  s    r  c                  C  s<   ddl m}  | j j}|dkr&tjS |dkr2dS tjS d S )Nr   r  r   r;   )r  r  r  Zget_current_device_or_throwr   r^   Zcpu_backendZcuda_backend)r  Z
device_strrA   rA   rH   get_current_backend  s    r  c                 C  s,   | t jt jfv r(tjjr(t dkr(t jS | S )z"Maybe upcast [b]float16 to float32r  )rC   r   r  r^   r  Zcodegen_upcast_to_fp32r  r  r/  rA   rA   rH   upcast_compute_type  s    r  KeyTypeValTypec                   @  s   e Zd ZdZddddZdddd	d
ZddddddZdddddZd#ddddddZddddZ	ddddZ
dddd Zdddd!d"ZdS )$
ScopedDictz
    A dictionary-like object that allows for scoped updates. It maintains
    an original dictionary and a set of new items that can override
    the original items within the scope.  The original dictionary is
    unmodified.
    zMapping[KeyType, ValType])original_dictc                 C  s   || _ i | _d S rx   r
  	new_items)ro  r
  rA   rA   rH   r:    s    zScopedDict.__init__r  r  r   c                 C  s   || j v r| j | S | j| S rx   r  r
  ro  r   rA   rA   rH   r'    s    

zScopedDict.__getitem__rB  )r   rw   r@   c                 C  s   || j |< d S rx   )r  )ro  r   rw   rA   rA   rH   __setitem__  s    zScopedDict.__setitem__rq  rk   c                 C  s   || j v p|| jv S rx   r  r  rA   rA   rH   __contains__  s    zScopedDict.__contains__NzOptional[ValType])r   ru  r@   c                 C  s"   || j v r| j | S | j||S rx   )r  r
  r  )ro  r   ru  rA   rA   rH   r    s    

zScopedDict.getrf   r?   c                 C  s,   t | j}| jD ]}|| jvr|d7 }q|S r;  )rL   r
  r  )ro  r  r  rA   rA   rH   rz    s
    



zScopedDict.__len__zIterator[KeyType]c                 c  s,   | j E d H  | jD ]}|| j vr|V  qd S rx   r  )ro  r  rA   rA   rH   __iter__  s    

zScopedDict.__iter__c                 C  s   t | jp| jS rx   )rk   r
  r  rv  rA   rA   rH   rJ    s    zScopedDict.__bool__c                 C  s   t d S rx   rw  r  rA   rA   rH   __delitem__  s    zScopedDict.__delitem__)N)r|   r}   r~   r   r:  r'  r  r  r  rz  r  rJ  r  rA   rA   rA   rH   r	    s   r	  )Zfrozen_defaultzOptional[type[Any]])rz   r   r@   c                 s(   ddd fdd}| d u r |S || S )Nra   )rz   r@   c                   s,   t jdkrtj| d dS tj|  dS d S )N)r>  rL  T)Zkw_onlyr   r   )r  version_infodataclasses	dataclass)rz   r   rA   rH   wrap  s    
zir_dataclass.<locals>.wraprA   )rz   r   r  rA   r   rH   ir_dataclass  s    r  zOptional[list[int]]c                  C  s&   t jj } | d ur"| jr"| jjS d S rx   )rC   r  r  r  Zfw_metadataZbw_donated_idxs)r  rA   rA   rH   get_donated_idxs  s    r  z3Union[Sequence[BaseSchedulerNode], ExternKernelOut])r}  r  	is_externr@   c                   s   ddl m}m} ddlm} ddlm} |rft| |s:J |jj	
|g    fdd| jD  nZt| tstJ | D ]F}|||fvrx|jd urx|jj	
|g    fdd|jjD  qxd S )Nr)   r  )r1   r  c                 3  s   | ]}|j  vr|j V  qd S rx   r  r  Zcurr_node_inforA   rH   r     s   
z:set_kernel_post_grad_provenance_tracing.<locals>.<genexpr>c                 3  s   | ]}|j  vr|j V  qd S rx   r  r  r  rA   rH   r   	  s   
)Zcodegen.simd_kernel_featuresr  r  r  r1   r  r  rm   r   Z._inductor_triton_kernel_to_post_grad_node_info
setdefaultr  r  r  r?  )r}  r  r  r  r  r1   r  r{  rA   r  rH   'set_kernel_post_grad_provenance_tracing  s,    

r  c                   @  s    e Zd ZdZdZdZdZdZdS )TritonAttrsDescriptorVersionr   r)   r  r>  r6  N)r|   r}   r~   V0_NO_TRITONV1_COMPILERV2_BACKENDSZV3_BACKENDS_TUPLEV4_DICTrA   rA   rA   rH   r    s   r  c                  C  sX   t jdd u rtjS dd l} dd l} t| jj	dr:tj
S t| j	j	drNtjS tjS d S )Nr  r   ZAttrsDescriptor)	importlibutil	find_specr  r  Ztriton.backends.compilerZtriton.compiler.compilerr   rH  compilerr   r  r!  )r  rA   rA   rH   #get_triton_attrs_descriptor_version  s    r&  c                   C  s   t  tjkS rx   )r&  r  r!  rA   rA   rA   rH   triton_version_uses_attrs_dict4  s    r'  r4   c                 C  sF   ddl m} t| |jsdS t| jtjjrBtjj	j
| jjv rBdS dS )zq
    Returns True if the node is an op that is not cudagraphable.
    Usually only custom ops have this tag.
    r)   r~  FT)r  r  rm   rt  rs  rC   r,  r-  r   r0  r  r1  ry  rA   rA   rH   is_cudagraph_unsafe_op8  s    r(  c                  C  sX   t jdd} t rTddlm} | }|rTt j|dd}| rPt j	|| gn|} | S )NZLD_LIBRARY_PATHr  r   )get_runtime_pathr  lib)
r  r  r  r^   r  Zlibfb.py.parutilr)  r  r  pathsep)r  r)  Zruntime_pathZlib_pathrA   rA   rH   get_ld_library_pathK  s    r,  )r{  r@   c                 C  s    ddl m} t| |o| jd uS )Nr   )SubgraphPythonWrapperCodegen)Ztorch._inductor.codegen.wrapperr-  rm   Zpartition_signatures)r{  r-  rA   rA   rH   #is_codegen_graph_partition_subgraphX  s    
r.  )r\  r@   c                 C  s<   ddl m} |jj| dr2|jj| dr2tjS tjS d S )Nr)   r  l        i   )	r  r  r  r  r  Zstatically_known_geqrC   r  r  )r\  r  rA   rA   rH   dtype_from_sizea  s    r/  )r   r<   )r   r@   c                 C  s$   | dkrt jj S d| v r dS dS )z;
    Returns True if the device supports MKL-DNN BF16.
    r   r<   TF)rC   r   r  Z_is_mkldnn_bf16_supportedr   rA   rA   rH   is_mkldnn_bf16_supportedo  s
    r1  c                 C  s$   | dkrt jj S d| v r dS dS )z;
    Returns True if the device supports MKL-DNN FP16.
    r   r<   TF)rC   r   r  Z_is_mkldnn_fp16_supportedr0  rA   rA   rH   is_mkldnn_fp16_supported{  s
    r2  )r   r   )r   r   )r:   )r)   r:   )rA   rL  rL  rM  r:   )N)NNT)r   )N)FTFN)N)N)N)N)F(S  
__future__r   r  rY  r  enumr   r"  rE  rg  r  r  rb  r   r  rU  r  r   r   r  r  rd  rF  r)  collections.abcr   r   r   r   r   r   r	   typingr
   r   r   r   r   r   r   r   r   r   r   Ztyping_extensionsr   r   r   r   r   r   r   rn   rC   Ztorch._inductor.runtime.hintsr   Ztorch.utils._ordered_setr   Ztorch.utils._pytreer   ZOPTIMUS_EXCLUDE_POST_GRADr   r    r!   r"   r#   r$   Ztorch._prims_commonr%   Ztorch.fxr&   r  r'   Ztorch.fx.noder(   r  r*   r  r,   r  r.   r  r/   r0   r1   r2   r3   r4   r5   Zoutput_coder7   r`  r8   r9   rK   r=   r   rN   Ztorch._dynamo.device_interfacerO   Ztorch._dynamo.utilsrP   Ztorch.autogradrQ   Ztorch.autograd.profiler_utilrR   Z(torch.fx.passes.graph_transform_observerrS   Ztorch.fx.passes.shape_proprT   Ztorch.utils._sympy.functionsrU   rV   rW   rX   rY   Ztorch.utils._sympy.symbolrZ   r[   Ztorch.utils._sympy.value_rangesr\   r]   r  r^   Zruntime.runtime_utilsr_   r   Z_IS_WINDOWS	getLoggerr|   r   ra   r  r   Z	VarRangesr;  rf   Z	InputTypeZGPU_KERNEL_BIN_EXTSr  r  r  r  rh   ri   rr   ZFunctionrt   r  r   r   r   r   r   r   r   r   r  r  r  r  r"  r(  rA  r   rK  rR  rW  rY  r[  r]  re  rf  rg  ri  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r   r  r  rZ  r$  Zclear_on_fresh_inductor_cacheZclear_inductor_cachesZfresh_inductor_cacher)  r.  	lru_cacher0  r1  r4  r5  rm  rv  r>  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r  r  r  r"  r*  r-  r0  r4  r5  r7  r8  r:  r  r@  rJ  rM  rO  rR  rS  rW  r  r  r[  Enumr\  rk  rn  ro  rw  rz  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  compiler  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r&  r'  r(  r,  r.  r/  ZSUPPORTED_MKLDNN_DEVICESr1  r2  rA   rA   rA   rH   <module>   s|  4 $	


$
HV&
	     	$;/;  $8$  
 ") 
 $6.

    ,@""
	 , 	 "!
 	
 $ $		,,('	$"$


$0
(
 $
	