o
    ZhE                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlmZ d dlmZmZmZmZmZmZmZ d dlmZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$m%Z% d
dl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3 ddl*m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl+m;Z;m<Z<m=Z= ddl>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZM ddlNmOZO ddlPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZXmYZYmZZZm[Z[ erd dl\m]Z]m^Z^m_Z_ e`eaZbejcdeadZeejcdeadZfejcdeadZgeT jhZieg d ZjejkG d!d" d"ZlG d#d$ d$elZmG d%d& d&elZnd9d+d,Zoed-eQeQd.ZpG d/d0 d0eSep eep ZqG d1d2 d2e<Zrejkd3d4G d5d6 d6ZsG d7d8 d8etZudS ):    )annotationsN)Counter)AnyCallableGenericno_type_checkOptionalTYPE_CHECKINGUnion)TypeVar)immutable_dict)
OrderedSet)FloorDivIdentityModularIndexing)free_symbol_is_type
prefix_strsymbol_is_typeSymT   )counters   )configir	scheduler)prologue_preserves_zero_mask)	code_hash)	MemoryDepStarDepWeakDep)IRNodeTritonTemplateBuffer)!indexing_dtype_strength_reduction)
green_textyellow_text)BaseSchedulerNodeBaseScheduling	WhyNoFuse)cache_on_selfexpr_fits_within_32bitget_dtype_sizeIndentedBufferPlaceholderprefix_is_reduction'set_kernel_post_grad_provenance_tracingsympy_index_symbolsympy_product
sympy_subsunique)ops
OpsWrapperV   )BlockPatternMatcher)CSEVariableindex_prevent_reorderingKernelPythonPrinter)MultiKernel)DisableReductionEnableReductionNodeScheduleMarkerSIMDKernelFeatures)IterableIteratorSequenceZ
perf_hintsZscheduleZfusion)zyxr0_r1_c                      sh   e Zd ZdZejjejjdd fddZee	e
dddZdddZee	e
d ddZ  ZS )!IterationRangesa  
    Each range tree represents multiple sets of iteration indexing
    in a single tiled dimension in the output kernel.

    If you have two loops ranges one (4, 3, 2) and another (4, 6),
    then the range tree will be:
            4 (i0)
        3 (i1)  6 (i3)
        2 (i2)
    Where i0 is shared between both loops, but then the split into
    different indexing vars.  All loop ranges must iterate over
    the same number of elements.
    )divisorlengthnamestrvar_listlist[sympy.Symbol]
var_rangesdict[sympy.Symbol, sympy.Expr]numel
sympy.Exprprefixkernel
SIMDKernelrootIterationRangesRootreturnNonec          
        sD   t    || _|| _|| _|| _|| _|| _|| _|| _	|	| _
d S N)super__init__rL   rN   rP   rR   rT   rJ   rK   rU   rW   )
selfrL   rN   rP   rR   rT   rU   rJ   rK   rW   	__class__ K/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/simd.pyr]   `   s   

zIterationRanges.__init__boolc                 C  
   t | jS r[   )r-   rT   r^   ra   ra   rb   is_reductionx   s   
zIterationRanges.is_reductionsympy.Symbolc                 C  rd   r[   )r/   rL   re   ra   ra   rb   symbol~      
zIterationRanges.symbolr   c                 C  s   dd t  D }|| j S )Nc                 S     i | ]\}}||qS ra   ra   ).0symtrT   ra   ra   rb   
<dictcomp>       z(IterationRanges.symt.<locals>.<dictcomp>)r   itemsrT   )r^   Zprefix_to_symtra   ra   rb   rl      s   
zIterationRanges.symt)rL   rM   rN   rO   rP   rQ   rR   rS   rT   rM   rU   rV   rW   rX   rY   rZ   rY   rc   rY   rg   )rY   r   )__name__
__module____qualname____doc__sympySOner]   propertyr(   r   rf   rh   rl   __classcell__ra   ra   r_   rb   rI   P   s    
rI   c                      sh   e Zd Z	d.d/ fddZd0ddZd1ddZd2ddZd3d!d"Zd4d&d'Zd5d)d*Z	d6d,d-Z
  ZS )7rX   NrL   rM   rR   rS   rT   indexintrU   rV   	pid_cacheOptional[dict[str, str]]is_looprc   
tensor_dimOptional[int]grid_dimhas_zdimrY   rZ   c             	     sj   |d u ri }t  j|g i |||| d || _i | _|| _|r'| jr%|	d u s'J || _|| _|	| _|
| _	d S )N)rL   rN   rP   rR   rT   rU   rW   )
r\   r]   r{   nodesr}   rf   r   r   r   r   )r^   rL   rR   rT   r{   rU   r}   r   r   r   r   r_   ra   rb   r]      s&   	
zIterationRangesRoot.__init__c                 C  s   d| j d| j dS )NzIterationRangesRoot(, z, ...))rL   rR   re   ra   ra   rb   __repr__   s   zIterationRangesRoot.__repr__c                 C  s   | j  D ]}|  qd S r[   )r   valuescache_clear)r^   nodera   ra   rb   r      s   
zIterationRangesRoot.cache_clearrg   c                 C  s   t | j dS )Nr{   )r/   rT   re   ra   ra   rb   	index_sym   s   zIterationRangesRoot.index_symrJ   rK   IterationRangesEntryc                 C  s   t jj|| | jrt|  |}nt|  ||}|| jvrMt	| j
 tt jj |||| }|t jj| < | j|  || j| < || j|< | j| S )zF
        Lookup a given RangeTreeEntry, creating it if needed
        )r5   graphsizevarsstatically_known_equalsrR   r   r   r   r   r   rT   nextrU   iter_vars_countrange_tree_nodesrh   rN   appendrP   )r^   rJ   rK   exprr   ra   ra   rb   lookup   s    


zIterationRangesRoot.lookuplengthslist[sympy.Expr]list[IterationRangesEntry]c                 C  s@   t jj}g }t|D ]}|| || || }q
g t|S r[   )rv   rw   rx   reversedr   r   )r^   r   rJ   itervarsrK   ra   ra   rb   construct_entries   s   
z%IterationRangesRoot.construct_entriesrO   c                 C  s   dd |  |D S )Nc                 S     g | ]}|  qS ra   )rh   )rk   era   ra   rb   
<listcomp>       z1IterationRangesRoot.construct.<locals>.<listcomp>)r   r^   r   ra   ra   rb   	construct      zIterationRangesRoot.construct+tuple[list[sympy.Symbol], list[sympy.Expr]]c                   s   dd |j D }fdd|D }|jdd d tjj g g  fdd}|D ]}tjj|j	 sE|
 t|j	  |j	 || q+tjjj s_|
 tj  g tg tfS )	z,Figure out vars from this tree used in indexc                 S  s   g | ]	}t jj|qS ra   )r5   rU   r   getrk   sra   ra   rb   r      s    z6IterationRangesRoot.vars_and_sizes.<locals>.<listcomp>c                   s    g | ]}|r|j  j kr|qS ra   rT   rk   nre   ra   rb   r      s     c                 S  s   t jjj| jtjdS )N)fallback)r5   r   r   	size_hintrJ   r   Zunbacked_symint_fallbackrF   ra   ra   rb   <lambda>   s    z4IterationRangesRoot.vars_and_sizes.<locals>.<lambda>keyc                   s(    |    | j  | j  d S r[   )r   rh   rK   )r   )rJ   
index_varssizesra   rb   add   s   z/IterationRangesRoot.vars_and_sizes.<locals>.add)free_symbolssortrv   rw   rx   r5   r   r   r   rJ   r   r   rR   r   )r^   r{   r   r   r   ra   )rJ   r   r^   r   rb   vars_and_sizes   s"   
z"IterationRangesRoot.vars_and_sizesr[   )rL   rM   rR   rS   rT   rM   r{   r|   rU   rV   r}   r~   r   rc   r   r   r   r   r   rc   rY   rZ   rY   rM   rY   rZ   rq   )rJ   rS   rK   rS   rY   r   )r   r   rY   r   )r   r   rY   rO   )r{   rS   rY   r   )rr   rs   rt   r]   r   r   r   r   r   r   r   rz   ra   ra   r_   rb   rX      s    
*





rX   c                      sd   e Zd Zd  fddZd!ddZd"ddZd#ddZd!ddZd$ddZd%ddZ	d&ddZ
  ZS )'r   rL   rM   rJ   rS   rK   r   parentrI   rY   rZ   c                   sP   t  j||j| |j|j|j|||j|jd	 || _t	
d | j| _|| _d S )N)	rL   rR   rN   rP   rT   rJ   rK   rU   rW   )r\   r]   rR   rN   rP   rT   rU   rW   r   	functools	lru_cache_codegencodegenr   )r^   rL   rJ   rK   r   r   r_   ra   rb   r]     s   
zIterationRangesEntry.__init__c                 C  s.   d| j  d| j d| j d| j d| j dS )NzIterationRangesEntry(r   ))rL   rJ   rK   r   rP   re   ra   ra   rb   r     s   .zIterationRangesEntry.__repr__c                   s$    fdd| _ dd | j _ | _d S )Nc                     s    S r[   ra   ra   rL   ra   rb   r          z/IterationRangesEntry.set_name.<locals>.<lambda>c                   S     d S r[   ra   ra   ra   ra   rb   r   !  r   )r   r   rL   )r^   rL   ra   r   rb   set_name  s   
zIterationRangesEntry.set_namec                 C  s   | j   d S r[   )r   r   re   ra   ra   rb   r   $  s   z IterationRangesEntry.cache_clearc                 C  s   t j|  | jS r[   )r5   rU   codegen_iteration_ranges_entryrL   re   ra   ra   rb   r   '  s   zIterationRangesEntry._codegenr   c                 C  s   g }t | jtjr|S t | jttfsJ t| j| jjdd  D ]"}t |tjtjfsD|j	}t
|dkrDtdd |D rD|| q"|S )Nr6   r   c                 s  s    | ]	}t |tjV  qd S r[   )r   r   SIZEr   ra   ra   rb   	<genexpr>4  s    
z8IterationRangesEntry.precomputed_args.<locals>.<genexpr>)
isinstancer   rv   Symbolr   r   typeargsIntegerr   lenallr   )r^   precomputed_argsargsymbolsra   ra   rb   r   +  s   
z%IterationRangesEntry.precomputed_argsr|   c                 C  rd   r[   )hashrL   re   ra   ra   rb   __hash__:  ri   zIterationRangesEntry.__hash__otherobjectrc   c                 C  s   t |tsJ | j|jkS r[   )r   r   rL   )r^   r   ra   ra   rb   __eq__=  s   zIterationRangesEntry.__eq__)rL   rM   rJ   rS   rK   rS   r   rS   r   rI   rY   rZ   r   )rL   rM   rY   rZ   r   )rY   r   rY   r|   )r   r   rY   rc   )rr   rs   rt   r]   r   r   r   r   r   r   r   rz   ra   ra   r_   rb   r     s    





r   valueUnion[int, float]rY   rM   c                 C  s6   | t dkrdS | t dkrdS t| rdS t| S )Ninfzfloat("inf")z-infzfloat("-inf")zfloat("nan"))floatmathisnanrepr)r   ra   ra   rb   constant_reprB  s   
r   CSEVariableType)bounddefaultc                      s  e Zd ZU dZeZded< ded< dZded< ded	< 	
	
	
dd fddZe	e
edddZdddZe	ddd Zdd!d"Zdd(d)Zdd+d,Zdd/d0Zdd6d7Zdd8d9Zdd:d;Zdd=d>Zdd?d@ZddBdCZddEdFZddGdHZddIdJZddMdNZddOdPZddRdSZddVdWZe dd\d]Z!e"e#j$j%fdd_d`Z&ddbdcZ'e"ddddeZ(ddfdgZ)ddhdiZ*ddjdkZ+ddldmZ,dddodpZ-ddrdsZ.ddtduZ/dddxdyZ0e1j2dd~dZ3dddZ4e dd Z5dd Z6dd Z7dd Z8dd Z9dd Z:dd Z;dddZ<  Z=S )rV   zo
    Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests.
    zCallable[[sympy.Expr], str]sexprkexprFrc   allow_block_ptrrM   kernel_nameNtilingdict[str, sympy.Expr]featuresr@   r}   r~   override_persistent_reductionOptional[bool]override_cooperative_reductionrY   rZ   c                   s   |d u ri }t    | _|  _t  _t  _dd | D  _	g  _
i  _t  _|  _|d ur;|n   _|d urF|n   _   _d  _td d fdd}| _ | d S )Nc                 S  s    i | ]\}}|t jj|qS ra   )r5   r   r   simplify)rk   rT   valra   ra   rb   rm   h  s    z'SIMDKernel.__init__.<locals>.<dictcomp>r{   rS   c                   s6   t jj|   }  jD ]} | |} q | S r[   )r5   r   r   simplify_with_rangesrP   range_treescombine_contiguous_dimscombine_modular_indexing_pairs)r{   treere   ra   rb   simplify_indexing}  s   

z.SIMDKernel.__init__.<locals>.simplify_indexing)r{   rS   )r\   r]   r   Zget_mutationsZ	mutationsr+   bodyZindexing_codero   numelsr   r   	itertoolscountr   rf   inside_reduction should_use_cooperative_reductioncooperative_reductionshould_use_persistent_reductionpersistent_reductionwant_no_x_dimno_x_dimr   r   r   r   initialize_range_tree)r^   r   r   r}   r   r   r   r_   re   rb   r]   Y  s8   




zSIMDKernel.__init__r|   c                 C     t dd | jD S )Nc                 s      | ]}t |V  qd S r[   )r-   rk   rT   ra   ra   rb   r         z0SIMDKernel.num_reduction_dims.<locals>.<genexpr>)sumr   re   ra   ra   rb   num_reduction_dims  s   zSIMDKernel.num_reduction_dimsdtypetorch.dtypec                 C     t r[   NotImplementedError)r^   r  ra   ra   rb   dtype_to_str     zSIMDKernel.dtype_to_strc                 C  s   |  | j S r[   )r  r   Zselect_index_dtypere   ra   ra   rb   index_dtype  s   zSIMDKernel.index_dtypec                 C     dS NFra   re   ra   ra   rb   r     r	  zSIMDKernel.want_no_x_dimr   rf   r   r   list[IterationRangesRoot]c                   s   t  fddtD }| p| }ddd}g d}	dd	g}
|r#|
}n	|r(|	}n|	|
 }|||}||	t}g }t|D ]6\}}t|}||}||}|d u rT|n|}|t| d
 | ||| ||oi| j ||d v d
 q<|S )Nc                 3      | ]	}| v r|V  qd S r[   ra   r   r   ra   rb   r     s    z3SIMDKernel.construct_range_trees.<locals>.<genexpr>rY   dict[Any, int]c                   s    dd t  fdd| D D S )Nc                 S  rj   ra   ra   )rk   idxr   ra   ra   rb   rm     s    
zPSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<dictcomp>c                 3  r  r[   ra   )rk   r   maskra   rb   r         zOSIMDKernel.construct_range_trees.<locals>.filtered_index_map.<locals>.<genexpr>)	enumerate)seqr  ra   r  rb   filtered_index_map  s   z<SIMDKernel.construct_range_trees.<locals>.filtered_index_map)rF   rE   rD   rG   rH   r{   rD   )r}   r   r   r   r   )rY   r  )r   all_prefixesr  r-   r   r   rX   r   )r^   r}   r   rf   r   r   Zactive_prefixesZno_r_dimr  Z	grid_dimsZreduction_dimsZtensor_dimsZtensor_dim_mapZgrid_dim_mapr   irT   r   r   r{   ra   r  rb   construct_range_trees  sF   





z SIMDKernel.construct_range_treesdict[str, str]c                 C  s.   |  || j| j | j| j}| j| d S r[   )r  r   r   rf   r   r   r   extend)r^   r}   r   ra   ra   rb   r     s   z SIMDKernel.initialize_range_treeindicesSequence[sympy.Expr]c                 C  r  )zr
        Hook called right before codegen with every index that will be
        used in the fused kernel.
        Nra   )r^   r  ra   ra   rb   finalize_indexing  r   zSIMDKernel.finalize_indexingrL   r{   rS   r   r8   c                 C  s,   | j }d| _ z| |||W || _ S || _ w r  )r   store)r^   rL   r{   r   priorra   ra   rb   store_reduction  s
   zSIMDKernel.store_reductionc                 C  r  r  ra   re   ra   ra   rb   r     r	  z+SIMDKernel.should_use_cooperative_reductionc                 C  r  r  ra   re   ra   ra   rb   r     r	  z*SIMDKernel.should_use_persistent_reductionrQ   c                 C  s   t tjdd | jD S )Nc                 s  s    | ]}|j  V  qd S r[   )rP   ro   rk   r   ra   ra   rb   r     s    

z(SIMDKernel.var_ranges.<locals>.<genexpr>)dictr   chainfrom_iterabler   re   ra   ra   rb   rP     s
   zSIMDKernel.var_rangesc                 C  r   )Nc                 s  s    | ]
}t |jd uV  qd S r[   )r|   r   r#  ra   ra   rb   r     s    z0SIMDKernel.triton_tensor_ndim.<locals>.<genexpr>)r  r   re   ra   ra   rb   triton_tensor_ndim  r   zSIMDKernel.triton_tensor_ndimr  c                 C  s(   dg|    }d||< dd| dS )NrZ   :[r   ])r'  join)r^   r  r   ra   ra   rb   indexing_size_str  s   zSIMDKernel.indexing_size_str	list[str]c                 C  sL   dg|    }| jD ]}|jd u rq
|jr| jr#|j  d||j< q
|S )N1ZBLOCK)r'  r   r   rf   r   rT   upper)r^   r   r   ra   ra   rb   dense_size_list  s   

zSIMDKernel.dense_size_listc                 C  s   |   }dd| dS )Nr)  r   r*  )r0  r+  r^   r   ra   ra   rb   dense_size_str  s   zSIMDKernel.dense_size_strc                 C  sx   t |ts|S |jd }| j| }d u r|S t|||ji}tjj	
|}t||j |jtjj|jj iS )Nr   )r   r   r   r   r   r1   r   r5   r   r   r   rW   r   r   rv   rw   rx   rR   rh   )r^   r{   rF   Z	tree_node	new_indexra   ra   rb   r   	  s   

z)SIMDKernel.combine_modular_indexing_pairsr   rX   c                 C  s8   t jj| }r|\}}t| |||S | ||S r[   )r5   r   r   Zexpand_floor_divr   _combine_contiguous_dims)r^   r{   r   Z
expand_resr3  denominatorra   ra   rb   r     s   z"SIMDKernel.combine_contiguous_dimsc           
      C  s   t |tjtjfr|S ||\}}t|dkr|S tjj	||t
|g||\}}}||kr1|S ||}t|tt|||}	|	S )zI
        More aggressive simplification to merge contiguous dims
        r6   )r   rv   r   r   r   r   r5   r   r   Z_simplify_loopsr9   r   r1   r$  zip)
r^   r{   r   r   r   Z	new_sizesZreindexZ_pruneZnew_index_varsr3  ra   ra   rb   r4  $  s   

z#SIMDKernel._combine_contiguous_dims'contextlib.AbstractContextManager[None]c                   s,    j d jp jtj fdd}| S )Nc                   3  sf     j  s jrJ d V  d S r   d _zd V  r)   W d _d S W d _d S d _w )NFT)r   rf   r   codegen_bodyra   r^   Zshould_flushra   rb   ctx;  s   



z)SIMDKernel.disable_reduction.<locals>.ctx)r   r   r   
contextlibcontextmanager)r^   r;  ra   r:  rb   disable_reduction8  s   zSIMDKernel.disable_reductionr   rO   c                 G  s,   t |t | jksJ dd t|| jD S )Nc                 S  s   g | ]	\}}| |qS ra   )r   )rk   rK   rangesra   ra   rb   r   R  s    z)SIMDKernel.set_ranges.<locals>.<listcomp>)r   r   r6  r   ra   ra   rb   
set_rangesP  s   
zSIMDKernel.set_rangesgroupsIterable[sympy.Expr]Sequence[Sequence[sympy.Expr]]Stuple[list[list[sympy.Expr]], list[list[Callable[[list[sympy.Expr]], sympy.Expr]]]]c              
     s  t dd |D rdd | D g fS tjjdd | D  fdd| D t d fdd}ddd}g }d}|D ]~}g }|D ]r}|drU|dd  qE|tk ru| dru|d7 }|tk ru| dsc|d tk r	|| r
|| st| }	t|| }
|||
|||	||d |
 qE|t||| qE|| q?t dd D sJ d d|  |fS )Nc                 s  s    | ]	}t |d kV  qdS r   Nr   )rk   rK   ra   ra   rb   r   ^  r  z5SIMDKernel._split_iteration_ranges.<locals>.<genexpr>c                 S     g | ]}g qS ra   ra   )rk   groupra   ra   rb   r   _      z6SIMDKernel._split_iteration_ranges.<locals>.<listcomp>c                 S  rG  ra   ra   )rk   _ra   ra   rb   r   b  rI  c                   s   g | ]}  |qS ra   r   )rk   g)svra   rb   r   c  rn   r  r|   r   rS   rY   c                   sF    |}|  |stt|  || <  |  | tS r[   )r   statically_known_multiple_of	CantSplitr   r   r   )r  r   
new_ranges	remainingrM  Z	var_countra   rb   	add_rangef  s   
z5SIMDKernel._split_iteration_ranges.<locals>.add_rangesizeidx1idx2(Callable[[list[sympy.Expr]], sympy.Expr]c                   s   d fdd}|S )N	flat_varsr   rY   rS   c                   s   |    |   S r[   ra   )rX  rU  rV  rT  ra   rb   getterr  r   zISIMDKernel._split_iteration_ranges.<locals>.make_combined.<locals>.getter)rX  r   rY   rS   ra   )rT  rU  rV  rZ  ra   rY  rb   make_combinedo  s   z9SIMDKernel._split_iteration_ranges.<locals>.make_combinedr   r6   c                 S  s   t jjS r[   )rv   rw   ZZero)rJ  ra   ra   rb   r   }  s    z4SIMDKernel._split_iteration_ranges.<locals>.<lambda>c                 s  s"    | ]}t jj|d kV  qdS )r6   Nr5   r   r   r   r   ra   ra   rb   r          zfailed to set ranges  )r  r|   r   rS   rY   r|   )rT  rS   rU  r|   rV  r|   rY   rW  )r   r5   r   r   r   r   r   r   r   Zstatically_known_gtrN  rO  r   operator
itemgetter)rA  r   rS  r[  return_getters_groupsZcurrent_groupZlength_groupZreturn_gettersrT  Zsize1Zsize2ra   rP  rb   _split_iteration_rangesW  sb   
	z"SIMDKernel._split_iteration_rangesreduction_numelc                 C  sj   t jj}t|d dkr!|t|t|d | r!|d |gf}z	| || W dS  ty4   Y dS w )Nr6   r   TF)r5   r   r   r   r   r0   rb  rO  )clsrA  r   rc  r   ra   ra   rb   is_compatible  s   zSIMDKernel.is_compatiblelist[list[sympy.Expr]]c                 C  sP   dd | j D }| js|D ]}t|rtjj||< qg | }| ||| jS )Nc                 S  s   i | ]}|j |jqS ra   )rT   rR   )rk   rtra   ra   rb   rm     rn   z3SIMDKernel.split_and_set_ranges.<locals>.<dictcomp>)	r   r   r-   rv   rw   rx   r   map_kernel_groups_to_node_sizesr@  )r^   r   r   rT   rA  ra   ra   rb   split_and_set_ranges  s   zSIMDKernel.split_and_set_rangesc                   sf   t |t |krtdd t||D r|| S | ||\}}g tj||   fdd|D S )a  
        We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1).

        To do this we need to split up the iteration space of i0 into something like:
            for i1 in s0:
              for i2 in s1:
                i0 = i1*s1 + i2
                ....

        This function matches and resplits lengths to the groups of
        this kernel to enable tiled + non-tiled fusions.
        c                 s  s.    | ]\}}t jjt|| d kV  qdS rE  r5   r   r   r   r0   )rk   rF   rL  ra   ra   rb   r     s
    
z=SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<genexpr>c                   s   g | ]} fd d|D qS )c                   s   g | ]}| qS ra   ra   )rk   fnr   ra   rb   r     r   zISIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>.<listcomp>ra   )rk   fnsrl  ra   rb   r     s    z>SIMDKernel.map_kernel_groups_to_node_sizes.<locals>.<listcomp>)r   r   r6  rb  r   r%  r&  )rd  rA  r   r@  rQ  ra  ra   rl  rb   rh    s   z*SIMDKernel.map_kernel_groups_to_node_sizesc                 C  s   t |tjS r[   )r   r   TMPr^   r{   ra   ra   rb   is_indirect_indexing  s   zSIMDKernel.is_indirect_indexingc                   s   |  |rdS dgt| j }|jD ] }|| jvrq| j| }t|jts'J ||jj  |j	9  < qt
jjj t fddt|| j D S )NFr6   c                 3  s$    | ]\}} | |kV  qd S r[   ra   )rk   Z	idx_rangeZ
iter_rangerK  ra   rb   r     s
    
z,SIMDKernel.is_broadcasted.<locals>.<genexpr>)rp  r   r   r   r   r   r   rX   r{   rK   r5   r   r   r   anyr6  r   )r^   r{   Zindex_numelsrh   entryra   rK  rb   is_broadcasted  s   




zSIMDKernel.is_broadcastedc                 C  s4   t |trddt| j| dS | | |S )a  
        Convert an index expr to a string that can be used in output code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the generated kernel.

        Index expressions often need to be passed in as arguments to the triton kernel.
        Rename_indexing and codegen_indexing keep track of the needed indices and add
        new parameters to the function signature.
        r)  r   r*  )r   listr+  mapindex_to_strr   Zrename_indexingro  ra   ra   rb   rv    s   
	zSIMDKernel.index_to_strc                 C  s   |  |}t|tjjj}t|tj	st|tj
r%|tjjj}t|tj
rV|tj
D ]"}|j}t|dkrUtdd |D rU|tjj|i}t||}q3|  |}t|tsb|n|jd }| |S )Nr   c                 s  s"    | ]}t |tjtjfV  qd S r[   )r   r   r   ZPRECOMPUTED_SIZEr   ra   ra   rb   r     s
    
z.SIMDKernel.prepare_indexing.<locals>.<genexpr>)r   r1   r5   r   r   Zprecomputed_replacementsr   Zatomsrv   floorZceilingsubsr   r   lookup_precomputed_sizer   r   r   codegen_indexing)r^   r{   ar   replacementsZ
simp_indexra   ra   rb   prepare_indexing  s"   
 


zSIMDKernel.prepare_indexingreorderc                   s    fdd j D }|rIt|dkrItdd |D }ddd |d | D d| d  ks=J d	d |d | D t|d | |d |< |S )
Nc                   s   g | ]
}|j r
 jr|qS ra   )rf   r   rk   tre   ra   rb   r   2  s    z1SIMDKernel.active_range_trees.<locals>.<listcomp>r6   c                 s  s    | ]}|j d v V  qdS )ZxyzNr   r  ra   ra   rb   r   6  s    z0SIMDKernel.active_range_trees.<locals>.<genexpr> c                 s  s    | ]}|j V  qd S r[   r   r  ra   ra   rb   r   7  s    Zzyxc                 S     g | ]}|j qS ra   r   r  ra   ra   rb   r   7  s    )r   r   r  r+  r   )r^   r~  Ztreesr   ra   re   rb   active_range_trees1  s   
2
zSIMDKernel.active_range_treesr   c                 C  s   t jj||  }t|jtdD ]6}|| jv rGi }| j| 	 D ]}t jj
|||< q!t|dkr@t| j| j|| j| _| j|   q|S )Nr   r   )r5   r   r   r   rP   sortedr   rM   r   r   ry  r   r1   r   r   )r^   r   symr|  Zpsra   ra   rb   rz  =  s   

zSIMDKernel.codegen_indexingc                 C     t d)NzNYI: codegen_nan_checkr  re   ra   ra   rb   codegen_nan_checkN     zSIMDKernel.codegen_nan_checkr   Optional[IRNode]c                 C  r  )NzNYI: call_kernelr  )r^   rL   r   ra   ra   rb   call_kernelQ  r  zSIMDKernel.call_kernelr  Union[str, OpsWrapper]r   Iterator[str]c                 c  s\    | j }| j}|rt||}t|}|| _ || _z|V  W || _ || _dS || _ || _w )z:Context manager to add an additional mask to tl.load/storeN)Z
_load_maskZ_load_otherr3   logical_andr4   _unwrap)r^   r  r   r!  Z	prior_valra   ra   rb   
mask_loadsT  s   

zSIMDKernel.mask_loadsc                 C  s\   dd | j  D }t||}i }| jD ]}t|j}t||dit||di ||< q|S )a\  
        This gets the stride of the index for each of the tiling variables
        (technically, it does it at index 0)

        For example, if
        xindex = x0 + 512*x1 + 1024*r0
        x0 = (xindex//512)
        x1 = (xindex % 512)
        r0 = rindex // 1024

        this function would return
        {xindex: 512, rindex: 1024}
        c                 S  s   i | ]\}}||j qS ra   )r   )rk   kvra   ra   rb   rm   v      z2SIMDKernel.get_strides_of_load.<locals>.<dictcomp>r6   r   )r   ro   r1   r   r/   rL   )r^   r{   Zindex_to_tile_indexesZindex_in_tile_varsstridesZ
range_treer   ra   ra   rb   get_strides_of_loadh  s   


zSIMDKernel.get_strides_of_loadc                 C  s    t |trtt| |S | |S r[   )r   tupleru  )rk  r   ra   ra   rb   _map_tuple_or_scalar  s   
zSIMDKernel._map_tuple_or_scalarc                 C  s0  g }t t| jj }| j \}}}}| j }tj	j
t| j }t|D ]i\}}||vr8|d q*tj	|}	tj	j
|	}
|
|krxtt  }d}|| D ]}t|ttfrj|d|  |d7 }qT||j qTt || }n|
}tj	|}t|}||| dt||k    q*t|S )a+  
        Try the best to estimate the total size (in bytes) of the
        kernel's inputs and outputs, which is used for estimating the memory
        throughput of this kernel. This information is used for checking how
        far we are from the peak memory bandwidth. It's important that
        we want to avoid overestimating the sizes of the inputs and outputs,
        because it can wrongfully give us a very large memory traffic value,
        which may be even larger than the theoretical bandwidth and thus
        become very misleading. This is particularly problematic for cases
        where we slice some inputs. In those cases, we should only count
        the size of the "slices" instead of the original inputs, because
        only the slices contribute to the real memory traffic.
        r   Zno_index_dep_r6   )r   r2   r   inplace_buffersr   python_argdefsr   buf_accessesr5   r   r   r   r0   r   r  r   Z	get_numelr   r   r   r   r   r   r{   Z	get_dtyper*   r|   r  )r^   nbytesZninplace_argsrJ  	call_argsr  Z	out_numelr  r   Z	arg_numelZbuf_sizer  Zno_index_dep_countdeprR   r  Z
dtype_sizera   ra   rb   estimate_kernel_num_bytes  s2   



 z$SIMDKernel.estimate_kernel_num_bytesc                 C  st  t | jjdkrt | jjdkrt | jjdkrdS | j \}}}}d}|D ]}tj|}|s2q'|	 }	t |	j
dkrt dd |	j
D dkrJq't|	j}
|du rW|
}q'||
krtd| d	d
|
 d|  }t| dd |D }dd |D }dd |D }dd |D }td| d| d| d| d| d }t|  dS q'td| d}t| dS )zr
        Print message if the kernel have mixed layout inputs.
        Only care about 4D tensor for now.
        r6   r   N   c                 S  s   g | ]}|d kr|qS )r6   ra   rk   rF   ra   ra   rb   r     r  z.SIMDKernel.warn_mix_layout.<locals>.<listcomp>r   zExpected stride order z, but found stride orderr^  z for kernel c                 S  s4   g | ]}t j|rtt j| jnd qS r[   )r5   r   try_get_bufferr   get_stride_order
get_buffer
get_layoutstriderk   rL   ra   ra   rb   r     s    
c                 S  s.   g | ]}t j|rt j| jnd qS r[   )r5   r   r  r  r  rT  r  ra   ra   rb   r     s    
c                 S  s0   g | ]}|t jjv rd n	|t jjv rdndqS )Z
GraphInputZIntermediateBufferN)r5   r   Zgraph_inputsZname_to_bufferr  ra   ra   rb   r     s    c                 S  r  ra   r   r  ra   ra   rb   r          z  param names z
  buf names z
  strides z	
  sizes z
  sources 
z%All the inputs for the triton kernel z have uniform layout)r   r   Zinput_buffersZoutput_buffersr  r  r5   r   r  r  rT  r   r  r  r$   logwarningr#   )r^   r   Zargdefsr  
_signaturerJ  Zuniform_stride_orderZarg_namebufZlayoutZstride_ordermsgZstride_order_listZ	size_listZsource_listZargdef_namesra   ra   rb   warn_mix_layout  s\   

	

zSIMDKernel.warn_mix_layoutc           	      C  sp   t ||d|}d| _t | jj|}t ||}d| _t ||}t ||}t ||d|}t	
|||fS )Nr  FT)r3   	reductionr   Z
index_exprr   rc  truedivsubmulr4   r  )	r^   r  r   Zsum_rnumelmeanZdxZdx2m2ra   ra   rb   welford_reduce_fallback  s   z"SIMDKernel.welford_reduce_fallbackc                 C  sD   t ||d|}t ||}t |}t ||d|}t||fS )Nmaxr  )r3   r  r  expr4   r  )r^   r  r   Zvmaxr  r  Zvsumra   ra   rb    prepare_softmax_twopass_fallback  s
   
z+SIMDKernel.prepare_softmax_twopass_fallbackc                 C  r  r[   r  re   ra   ra   rb   codegen_kernel  r	  zSIMDKernel.codegen_kernelc                 C  r   r[   ra   re   ra   ra   rb   r9  "  r	  zSIMDKernel.codegen_bodyrr  r   c                 C  r   r[   ra   )r^   rr  ra   ra   rb   r   %  r	  z)SIMDKernel.codegen_iteration_ranges_entry)NNN)r   r   r   r@   r}   r~   r   r   r   r   rY   rZ   r   )r  r  rY   rM   r   rp   )r}   r~   r   rc   rf   rc   r   r   r   rc   rY   r  )r}   r  rY   rZ   )r  r  rY   rZ   )rL   rM   r{   rS   r   r8   rY   rZ   )rY   rQ   )r  r|   rY   rM   )rY   r-  )r{   rS   rY   rS   )r{   rS   r   rX   rY   rS   )rY   r7  )r   rS   rY   rO   )rA  rB  r   rC  rY   rD  )rA  rB  r   rC  rc  rS   rY   rc   )r   rC  rY   rf  )rA  r  r   rC  rY   rf  )r{   rS   rY   rc   )r{   rS   rY   rM   F)r~  rc   rY   r  )r   rS   rY   rS   r   r[   )rL   rM   r   r  rY   rZ   )r  r  r   r   rY   r  )r{   rS   rY   rQ   )rr  r   )>rr   rs   rt   ru   pexprr   __annotations__r   r]   ry   r(   r   r  r  r
  r   r  r   r  r"  r   r   rP   r'  r,  r0  r2  r   r   r4  r>  r@  staticmethodrb  classmethodrv   rw   rx   re  ri  rh  rp  rs  rv  r}  r  rz  r  r  r<  r=  r  r  r  r  r  r  r  r  r9  r   rz   ra   ra   r_   rb   rV   O  s|   
 /



6













	

L




&



?GrV   c                   @  s.  e Zd ZU eZded< dd Zdd ZeZeZ	dd Z
dQddZedRddZdSddZdTddZdd ZdddUd!d"Zd#d$ Z	dVdWd,d-Zd.d/ Zeed0dXd2d3ZedYd8d9ZedZd<d=Zed[d?d@Zed\dBdCZeejjfd]dDdEZ dFdG Z!d^dHdIZ"dVdJdKZ#dLdM Z$dNdO Z%dPS )_SIMDSchedulingz	type[Any]kernel_typec                 C  s   t dd |D S )Nc                 s  s"    | ]}t jjt|V  qd S r[   rj  r   ra   ra   rb   r   -  r]  z*SIMDScheduling.group_fn.<locals>.<genexpr>)r  r1  ra   ra   rb   group_fn,     zSIMDScheduling.group_fnc                   sd  t |tjst |tjrtj||S |j\}\}}|j\}\ t||}| r7| s7| r6|d n| rG| sG| rG|d | rc| rc| koV|k}|sa|d| | |S | s7| s7| kru|ks| s|d| | dS |	 D ]+}| r n$|
 | @ sq|j\}\}	}
||	kr||
ks|d||	||
  dS qt||fdD ]\}}| rt | t}|s|| d |  S q| |	 ||}| |	 ||}| |	 |	  ||}tjjr5d}t|d	krt|d	kr||  ko|kn  }n||k}nt|d	kr)||k}|s5|d
||| dS dS | s| r|dkrKdksMJ |  krt fdd|	 D si|d dS tjjr| st| |	 | |df dffv }|s|d |S dS | kr|d | kS | r| rJ | ||S )z
        Hook called by Scheduler to determine if the Triton backend
        can fuse node1 and node2.  These nodes might already be
        FusedSchedulerNodes.
        z&Split scan cannot fuse with reductionsz1numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)z5numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)Fz:numel/rnumel mismatch prologue mismatch (%s, %s), (%s, %s))node1node2z is not TritonTemplateBufferTr   ztiling mismatch (%s, %s, %s)r6   c                 3  s$    | ]}t  f| V  qd S r[   )rV   re  
get_rangesr   Znumel2Zrnumel2ra   rb   r     s
    
z*SIMDScheduling.can_fuse.<locals>.<genexpr>z"nodes numel/rnumel incompatibilityzinvalid tiling for reductionznodes numel incompatibility)r   r   ZForeachKernelSchedulerNodecan_fuserH  r'   Zis_split_scanrf   is_template	get_nodesused_buffer_namesget_buffer_namesr6  Zget_template_noder!   select_tilingr   tritonZ tiling_prevents_pointwise_fusionr   r   Z tiling_prevents_reduction_fusionr  r   can_fuse_horizontal)r^   r  r  rJ  Znumel1Zrnumel1whyZreduction_can_fuser   Z	pro_numelZ
pro_rnumelr   Z	node_nameZis_triton_templatetiling1Ztiling2Ztiling3ZcondZis_reduction_tiling_validra   r  rb   r  /  s   





zSIMDScheduling.can_fusec              	     sb  g t tj  t t  t t   d fdd}fdd} fdd} fdd}tj fd	d
}fdd}	|D ]h}
|
v rMqF|
 ||
r|	|
rn|  W d    n1 siw   Y   r{||
s{pytnd ||
 qF||
r|  |
 W d    n1 sw   Y  qFt	d d d|
j
d  S )Nc                   s2   | j \}\}}| kr|kp|  ko|dkS Nr6   rH  r   rJ  Z
node_numelZnode_rnumelrR   r  ra   rb   fits_in_main_body  s   z@SIMDScheduling.generate_node_schedule.<locals>.fits_in_main_bodyc                   s&   | j \}\}}| ko|dkodkS r  r  r  r  ra   rb   fits_outside_reduction  s   zESIMDScheduling.generate_node_schedule.<locals>.fits_outside_reductionc                   s"   | j jD ]
}|j v r dS qdS )NTF)read_writesreadsrL   )r   read)current_loop_buffer_usagera   rb   expect_improved_memory_usage  s
   
zKSIMDScheduling.generate_node_schedule.<locals>.expect_improved_memory_usagec                   s    |  |   dd | jjD  |  r8t| tjr8t| j	t
jr8t| j	jt
js8 |   d S  dd | jjD  d S )Nc                 S  r  ra   r   r  ra   ra   rb   r     r  zXSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loop.<locals>.<listcomp>c                 S  r  ra   r   r  ra   ra   rb   r     r  )r   r   updater  r  rf   r   r   SchedulerNoder   r   ZComputedBufferdataZScanget_namewrites)r   )r  donenode_schedulenot_ready_yet_nodesra   rb   schedule_node_in_loop  s   


zDSIMDScheduling.generate_node_schedule.<locals>.schedule_node_in_loopc                   3  sn    rd t u r  nt r%t d t  d d V  t       d S )Nr8  r6   )r>   popr   r=   insertclearra   )r  maybe_split_indexr  r  ra   rb   end_current_reduction_loop  s   


zISIMDScheduling.generate_node_schedule.<locals>.end_current_reduction_loopc                   s<   dkrdS  | j @ sdS |rt|d ttfrJ t S )Nr6   Fr8  )Z	ancestorsr   r>   r=   rc   )r   r  )r  r  ra   rb   #requires_closing_previous_reduction  s   
zRSIMDScheduling.generate_node_schedule.<locals>.requires_closing_previous_reductionzunexpected group: (r   z) != r6   )r   r   r%   rM   r<  r=  r   r   r   r  rH  )r^   r   rR   r  r  r  r  r  r  r  r   ra   )r  r  r  r  r  rR   r  rb   generate_node_schedule  sD   





z%SIMDScheduling.generate_node_scheduler   <Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode]c                 C  sN   |  }t|dd dj\}\}}| |||}td| | t|||S )zK
        Given a set of pre-fused nodes, generate a Triton kernel.
        c                 S     t |  S r[   r|   rf   r   ra   ra   rb   r   #      z-SIMDScheduling.codegen_node.<locals>.<lambda>r   zSchedule:
 %s)r  r  rH  r  schedule_logdebugcodegen_node_scheduler@   )r^   r   r   rJ  rR   r  r  ra   ra   rb   codegen_node  s   
zSIMDScheduling.codegen_noderR   rS   buffers<Iterable[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]rY   rc   c                 C  sl   t t jj}t| sdS dd |D }tdd |D sdS tjj	| | |D ]
}tjj	|| q)dS )NFc                 S  s    g | ]}|  r|  qS ra   )Zhas_tensor_outputr  Zstorage_size)rk   r  ra   ra   rb   r   8  s    
z9SIMDScheduling.can_use_32bit_indexing.<locals>.<listcomp>c                 s  r   r[   )r)   )rk   rT  ra   ra   rb   r   >  r   z8SIMDScheduling.can_use_32bit_indexing.<locals>.<genexpr>T)
torchZiinfoZint32r  r)   r   r5   r   r   Z	guard_leq)rR   r  Zint_maxZ	buf_sizesrT  ra   ra   rb   can_use_32bit_indexing,  s   z%SIMDScheduling.can_use_32bit_indexingkernel_featuresr@   c              	   C  s"  |j }| ||j|j}| ||gd|i}|D ]}| || qt| |D ]9}t	| |
 }W d    n1 s>w   Y  | |||}tjjrSt|| td| ||_t||_q(~t|dkrnt|}n|\}t	| | D ]}	|	  q{W d    n1 sw   Y  | | ||j tjr|  tjr||d j tj j|jO  _tj j|jO  _tjjj rtj!r|d j"# }
| D ]5}	|	$ }||
vrq|	j%d usJ |	j%& }|d ur
t'd d  d7  < tjj(d|j)d| d	 q| *  d S )
Nr   z+Generating kernel code with kernel_name: %sr6   r   ZinductorZintermediate_hookszrun_intermediate_hooks(r   r   )+r  r  rR   rc  create_kernel_choices!codegen_node_schedule_with_kernelr<   Zmerge_workspaces_inplacer5   set_kernel_handlerr  define_kernelr   traceenabledr.   r  r  r   r   r   Zscheduler_nodesmark_runcodegen_commentr  Znan_assertsr  r  r   removed_buffersinplaced_to_removewrapper_codeZsupports_intermediate_hooksZgenerate_intermediate_hooksr   Zlive_output_buffersr  r   Zget_origin_noder   	writelinerL   free_buffers_in_scheduler)r^   r  r  r   ZkernelsrU   src_coder   Zfinal_kernelr   Z	live_outsrL   Zorigin_nodera   ra   rb   r  H  st   







z$SIMDScheduling.codegen_node_schedulelist[SIMDKernel]c                 C  s   | j |i |gS r[   )r  )r^   r  Zkernel_argsZkernel_kwargsra   ra   rb   r    s   z$SIMDScheduling.create_kernel_choicesc              	   C  s   |t t  }i }|D ]0}|tu r||  q|tu r"|  q|  ||	 }|
t|j|  q||  |D ](}|tu rS||  qE|tu r\|  qEt|j ||	 }|| qEW d    d S 1 syw   Y  d S r[   )r<  	ExitStackr=   enter_contextr>  r>   closeZdecide_inplace_updateri  r  r  r$  fromkeys_bodyZindexing_from_argsr   r  keysr"   r   )r^   r  rU   stackZall_indexingr   r   ra   ra   rb   r    s4   


"z0SIMDScheduling.codegen_node_schedule_with_kernelFonly_gen_src_codeOptional[str]c                C  s  |j \}\}}|dksJ |j|j\}}	i }
| }g }|D ]+}| }|| ||@ rKt|dks7J ||
tt|< |j	
tt| g }q t|dksTJ | |se|g|D ]}|  q^|	 }|d |D ]}|||  qp|jt  W d   n1 sw   Y  |j D ]|\}}d| d}|
| g  }rtdd |D }td	| N ||9 |D ]'}t| dkrt|dkrt|r| j| O  _|||  q|jt  W d   n1 sw   Y  W d   n	1 sw   Y  qW d   n	1 s w   Y  t|ts7|d
 |jddd t | |j! D ]}d| d}|j|dd qB|d t|trb|}n|d |j"}W d   n	1 suw   Y  g |||}tj#r|$ d }|%  d| d|&|'  }|r|W  d   S | (|||}tj)j*rt+|| W d   n	1 sw   Y  | ,| |-||j tj. j/|j/O  _/tj. j0|j0O  _0| 1  dS )z
        Codegen a triton template

        If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper
        r6   r   z<STORE_OUTPUT>Nz<LOAD_INPUT_>c                 s      | ]}|  V  qd S r[   )Zcan_codegen_without_upcasts)rk   Zp_nra   ra   rb   r     s    
z2SIMDScheduling.codegen_template.<locals>.<genexpr>ztriton.codegen_upcast_to_fp32z<DEF_KERNEL>z	<ARGDEFS>F)strictg    eAr  )2rH  r   Zmake_kernel_renderr  r  r   r   r   iterZprologue_fused_inputsr   r  Zset_subgraph_bodyr   ri  r  ZcseZ
invalidater   Znamed_input_nodesro   r   r  r   r   patchr   Z#prologue_fused_inputs_preserve_zeror   rM   Zfinalize_hookr5   r  r  codebenchmark_kernelr  Zimports_for_benchmark_kernelZcodegen_kernel_benchmarkgetvaluer  r  r  r.   r   r  r   r  r  r  )r^   Ztemplate_nodeZepilogue_nodesZprologue_nodesr  rJ  Z_numelr  rU   renderZbuf_name_to_prologue_groupZtemplate_readsZprologue_groupprologuenamesr   Zpartial_codeZ
input_namebufferZsubgraph_nameZcan_codegen_without_upcastZprologue_noder  r  Znum_gbr   ra   ra   rb   codegen_template  s   


.





zSIMDScheduling.codegen_templatec                 C  s   t jjt jj  d S r[   )r5   r   r  r  Z
device_opsZsynchronizere   ra   ra   rb   codegen_sync-  s   zSIMDScheduling.codegen_syncsubkernel_nodeslist[BaseSchedulerNode]custom_part_algorithmenable_autotunemixed_sizesr  list[tuple[str, Any, Any]]c              
   C  s  ddl m} dd |D }i i }}	t||D ]6\}
}t|dd dj\}\}}| |||}| |||}||||f|	|
< |j|t|||| d||
< q|j	|| |||	d	}t
d
t|dd |D  g }|D ]s}dd |D }|||d}t||D ]R\}
}| |	|
 d |||
  ||
 }|	|
 d }|st| t|D ]}|  qW d    n1 sw   Y  tj j|jO  _tj j|jO  _q~| }||||f qj|S )Nr6   )ComboKernelc                 S  r   ra   r  rk   r   ra   ra   rb   r   :  r   z=SIMDScheduling.generate_combo_kernel_code.<locals>.<listcomp>c                 S  r  r[   r  r   ra   ra   rb   r   =  r  z;SIMDScheduling.generate_combo_kernel_code.<locals>.<lambda>r   )r   Zoptimize_mask)r   Ztriton_schedulingZcustom_algorithmZ
kernel_mapZnode_info_mapz1ComboKernels: %d nodes partitioned into %s groupsc                 S  s   g | ]}t |qS ra   rF  )rk   pra   ra   rb   r   Q  r   c                 S  r   ra   r'  r(  ra   ra   rb   r   U  r   )r#  r$  r   )Ztriton_combo_kernelr&  r6  r  rH  r  r  Zcreate_triton_kernelr@   Zhorizontal_partitionr  r  r   r  Zcreate_sub_kernelr5   r  r?   Z
only_nodesr  r   r  r  r  r   )r^   r   r"  r#  r$  r  r&  Zfused_node_listsZsubkernel_mapZnode_schedule_mapZpnr   rJ  rR   r  r  r   Z
partitionskernel_code_listZ
node_grouprU   Z	subkernelr   r  ra   ra   rb   generate_combo_kernel_code0  sd   



z)SIMDScheduling.generate_combo_kernel_codec                 C  s   |  }|j}|j}tjdkptjdko|}| ||||}|D ]!\}}}	| ||g|}
| |g t	d|
 |
tjj|
 q |   d S )Nr6   z"ComboKernels: generated kernel %s.)Zget_subkernel_nodesZuse_custom_partition_algor#  r   Zcombo_kernel_allow_mixed_sizesr+  r  r   r  r  r  r5   r   r  r  )r^   Zcombo_kernel_noder   r"  r#  r$  r*  r  rU   rJ  r   ra   ra   rb   codegen_combo_kernelm  s   
z#SIMDScheduling.codegen_combo_kernel    list[CandidateTiling]c           	        s   dk}d
 fdd}|  \}t|dkr!tdkr!g S |  \}|||r-|n||} fdd	|D }|S )Nr6   is_pointwiserc   rY   r.  c                   s  t |jt |ksJ d|jd||j|jg}tdd tj|D s)J dd tj|D }tdd |jD }ddd}t	 
||g| dddg}|D ]}tjj|j|j}	t |	t |ksjJ z |	dd }
|
t |krzW qTtdd |	|
d D rW qTW n	 ty   Y qTw ||d|
 |||
d f}tjjtdd t||	D }|j|v r|d9 }t	|d r|d9 }t	|d r|d9 }tjj|tt| dkr|t	 
||d|
 |||
d g||jd qT|S )zX
            Compute tiling candidates by dividing up the iteration ranges.
            zrw.range_vars=z ranges=c                 s  s    | ]
}t |ttfV  qd S r[   )r   r   r   rk   r  ra   ra   rb   r     s
    
zHSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<genexpr>c                 S  s(   g | ]}|j tjjvrt|tr|qS ra   )rL   r5   r   r  r   r   r0  ra   ra   rb   r     s    zISIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.<listcomp>c                 S  r  ra   r   r0  ra   ra   rb   r     r  r?  r  rY   rS   c                 S  s   t jjt| S r[   rj  )r?  ra   ra   rb   collapse_ranges  r  zNSIMDScheduling.candidate_tilings.<locals>.tile_ranges.<locals>.collapse_rangesnoner   )r   rL   scorer6   c                 s  s    | ]}|d kV  qdS rE  ra   r   ra   ra   rb   r     r   Nc                 s  s     | ]\}}|d kr|V  qdS rE  ra   )rk   rT  r  ra   ra   rb   r     s    r   r   r3  rL   )r?  r  rY   rS   )r   Z
range_varsr  r  r   r   r%  r&  r   CandidateTilingcreate_partial_tilingr5   r   r   Zstride_hintsr{   
ValueErrorr   r0   r6  rL   is_good_sizer   )r/  r?  rwZdep_sourcesdepsZwrite_namesr1  tilingsr  r  splitZtiled_groupsr3  )rd  rc  reduction_rangesra   rb   tile_ranges  s   (




z5SIMDScheduling.candidate_tilings.<locals>.tile_rangesc                   s*   g | ]}t  |j|j|jd qS )r4  )r5  complete_partial_tilingr   r3  rL   )rk   r   )rd  rR   rc  ra   rb   r     s    z4SIMDScheduling.candidate_tilings.<locals>.<listcomp>)r/  rc   rY   r.  )r  r   Z"pointwise_or_reduction_read_writes)	rd  r   rR   rc  r/  r>  Zpointwise_rangesZpartial_tilingsZfull_tilingsra   )rd  rR   rc  r=  rb   candidate_tilings  s   ^
z SIMDScheduling.candidate_tilings	pw_tilingr  reduction_tilingr   c                 C  sF   g dt | d }ddgdt | }tg t||t||S )zK
        Create a tiling dict from pointwise and reduction splits.
        )rD   rE   rF   NrG   rH   )r   r   r6  )rd  rA  rB  Zpw_prefixesZreduction_prefixesra   ra   rb   create_tiling  s
   zSIMDScheduling.create_tilingr   r/  c                 C  s   |  |r|ng |s|S g S r[   )rC  )rd  r   r/  ra   ra   rb   r6    s   
z$SIMDScheduling.create_partial_tilingrc  c           	      C  sH   t | }d|v }|| }|t| g}|r||fn||f}| j| S )zb
        Given a tiling for only pointwise or reduction dimensions, adds the missing one.
        rF   )rt  r   r0   rC  )	rd  r   rR   rc  Zsplitsr/  Ztotal_numelZmissing_tilingZtiling_argsra   ra   rb   r?    s   

z&SIMDScheduling.complete_partial_tiling"list[dict[str, tuple[sympy.Expr]]]c              
   C  s   |dk}t tttjf   }t|D ]}t|tj	sq|
 }|s+t|d dkr+q||r0dnd }|g}	dd |j D }
|
D ]~}g |j }tjj}tjj}t|D ]\}\}}||9 }|||ri nqW|||sqqB|d }|r}|d| n||d }g }|D ]3\}}t|j|}td|t|t t|}t||||}|dur|d n|g}| | q|	!| qB|	D ]2}tdt|t"j#j$ }|d }t%|d| }|ft&||d  }|'| (| )|||| qqt*|tdd}|S )	z
        Creates N-dimensional tiling candidiates, attempting to simplify loads/stores
        by tiling the kernel into higher dimensions.

        Returns a list of tilings ranked by dimensionality.
        r6   r   c                 S  s(   g | ]}t |trt|jd kr|qS )r   )r   r   r   r?  r0  ra   ra   rb   r   K  s    z1SIMDScheduling.get_nd_tilings.<locals>.<listcomp>Nr   T)r   reverse)+r   r$  rM   rv   Exprr>   filterr   r   r  r  r   r  Zreads_and_writesr?  ro   rw   rx   r5   r   r   r  Zstatically_known_geqr   r7   Zget_subexpr_involving_symbolr{   r  r   r   r   Zmatch_mod_div_block_exprr  r   r   r  	max_tilesr0   r  r   r?  r6  r  )rd  r  Zpointwise_numelrc  r/  r;  r   Znode_rangesZranges_to_tileZnode_tilingsZmemory_depsr  Zall_var_rangesZpointwise_vars_numelr   Zpointwise_end_idxvarrR   Zreduction_start_idxrP   Zindex_tilingr{   Znum_dimsZmatch_resultdimsZnode_tilingZnum_leading_dimsZfirst_trailing_dimZcollapsed_leading_dimZcollapsed_splitsranked_tilingsra   ra   rb   get_nd_tilings+  s   


zSIMDScheduling.get_nd_tilingsc                   s   dk}|  |g g}|stjjrtjjdkrBtjtjkr@t	
|D ]}tjjs?t| || dkr?ttd  |S q#|S tt  }t }t	
|D ]&}| || D ]}	|	j|v raqY|	jdurl||	j ||	  |	j7  < qYqPdd | D }
tjjdkr|rddd}tdt|
D ]}||
d |
| }|dur|g|
 }
 nqt|
dkrtd|
 tjjr| || |
 }
|
D ]ttsJ t fdd|D r߈  S q|S )z
        Heuristics to decide how to tile kernels.
        Currently, we tile based on stride-1 dimensions.

        Returns:
            `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel`

        r6   r   z
                                Reduction over non-contiguous dims.
                                Consider setting config.triton.tile_reductions to True.
                                Nc                 S  s   g | ]\}}|j qS ra   )r   )rk   candidate_tilingr3  ra   ra   rb   r     s    z0SIMDScheduling.select_tiling.<locals>.<listcomp>r   tiling0r   r  rY   Optional[dict[str, sympy.Expr]]c                 S  s   | d |  dd}}|d | dd}}tjj|| dkr#d S tjj|| dk r;||f||f\}}\}}tjj|| dksHJ tjj||sRd S |t|||| d d}|S )NrF   rE   r6   r   rG   )rD   rE   rF   rG   )r   r5   r   r   r   rN  r   )rN  r  Za0Za1Zb0b1Z
new_tilingra   ra   rb   convert_tiling_to_3d  s   z:SIMDScheduling.select_tiling.<locals>.convert_tiling_to_3dzpossibly bad tiling: %sc                 3  s4    | ]}t |tjrtj |  d V  qdS ))rc  N)r   r   r  rV   re  r   r  r(  rc  r   ra   rb   r     s    

z/SIMDScheduling.select_tiling.<locals>.<genexpr>)rN  r   r  r   rY   rO  )rC  r   r  Ztile_reductionsrH  perf_hint_loglevelloggingWARNINGr>   rG  r   r@  infotextwrapdedentr   rM   collectionsr   rL   r   r3  most_commonrangeZprefer_nd_tilingrL  r   r$  r   )rd  r  rR   rc  r/  Zdefault_tilingr   Z
seen_namesZcandidate_tilesrM  rK  rQ  r  Znew_3d_tilingra   rR  rb   r    sv    




	zSIMDScheduling.select_tilingc                 C  r   r[   ra   re   ra   ra   rb   flush  r	  zSIMDScheduling.flushc                 C  r  r  ra   re   ra   ra   rb   ready_to_flush  r	  zSIMDScheduling.ready_to_flushc              	   C  s2  t dd |D set|dd dj\}\}}| |||}| |||}| j|t|||d}| || t	d|! t
| | }	W d    n1 sPw   Y  W d    n1 s_w   Y  n)|d |\}
}}t	d| | j|||
d	d
}	W d    n1 sw   Y  |	ttjd}	|	S )Nc                 s  r  r[   )r  r   ra   ra   rb   r     r   zASIMDScheduling.generate_kernel_code_from_nodes.<locals>.<genexpr>c                 S  r  r[   r  r   ra   ra   rb   r     r  z@SIMDScheduling.generate_kernel_code_from_nodes.<locals>.<lambda>r   )r   r  r   Tr  Ztriton_)rq  r  rH  r  r  r  r@   r  r   r  r5   r  r  Zget_prologue_template_epiloguer  replacerM   r,   ZKERNEL_NAME)r^   r   r  rJ  rR   r  r  r   rU   r  r  templateepiloguera   ra   rb   generate_kernel_code_from_nodes  s>   


 
z.SIMDScheduling.generate_kernel_code_from_nodesc                 C  r   r[   ra   )r^   r  ra   ra   rb   r   4  r	  zSIMDScheduling.codegen_commentc                 C  r  r[   r  )r^   r  r  rU   ra   ra   rb   r  7  r	  zSIMDScheduling.define_kernelN)r   r  )rR   rS   r  r  rY   rc   )r  r@   )r  r@   rY   r  )rY   r  r  )r   r!  r"  rc   r#  rc   r$  rc   r  rc   rY   r%  )rY   r.  )rA  r  rB  r  rY   r   )r   r  r/  rc   rY   r   )r   r   rR   rS   rc  rS   rY   r   )rY   rD  )rY   r   rp   )&rr   rs   rt   rV   r  r  r  r  Zcan_fuse_verticalr  r  r  r  r  r  r  r  r  r  r+  r,  r  r   r   r@  rC  r6  r?  rL  rv   rw   rx   r  r]  r^  rb  r   r  ra   ra   ra   rb   r  )  sN   
  	
`

C
#v	={
qr

r  T)frozenc                   @  s6   e Zd ZU ded< ded< dZded< edd	 ZdS )
r5  r   r   r|   r3  Nr  rL   c                 C  s"   t jj| } | dko| d dkS )z@Somewhat arbitrary heuristic used to boost scores for some sizesr-  r   r\  )r   ra   ra   rb   r8  A  s   zCandidateTiling.is_good_size)rr   rs   rt   r  rL   r  r8  ra   ra   ra   rb   r5  ;  s   
 r5  c                   @  s   e Zd ZdS )rO  N)rr   rs   rt   ra   ra   ra   rb   rO  H  s    rO  )r   r   rY   rM   )v
__future__r   rZ  r<  dataclassesr   r   rU  r   r_  rX  r   typingr   r   r   r   r   r	   r
   Ztyping_extensionsr   rv   r  Ztorch._loggingZtorch.fx.immutable_collectionsr   Ztorch.utils._ordered_setr   Ztorch.utils._sympy.functionsr   r   r   Ztorch.utils._sympy.symbolr   r   r   r   Z_dynamo.utilsr   r  r   r   r   Zanalyze_preserves_zero_maskr   Z	codecacher   dependenciesr   r   r   r    r!   Zoptimize_indexingr"   Zruntime.runtime_utilsr#   r$   r%   r&   r'   utilsr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   Zvirtualizedr3   r4   r5   Zblock_analysisr7   commonr8   r9   r:   r;   Zmulti_kernelr<   Zsimd_kernel_featuresr=   r>   r?   r@   collections.abcrA   rB   rC   	getLoggerrr   r  Z_loggingZgetArtifactLoggerrS  r  Z
fusion_logZdoprintr  r  	dataclassrI   rX   r   r   r   rV   r  r5  	ExceptionrO  ra   ra   ra   rb   <module>   s   $4
8{
>
     _        
