o
    Zh                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlmZmZ d dlmZmZmZmZmZmZmZ erbd dlmZ d dlmZ d dlZd dlZd dlZd dlm Z m!Z! d dl"m#Z#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z) d dl*m+Z+ d dl,m-Z-m.Z. d dl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6 ddl7m8Z8 ddl9m:Z:m;Z;m<Z< ddl=m>Z> ddl4m?Z?m@Z@mAZAmBZB ddlCmDZDmEZE ddl5mFZFmGZGmHZHmIZImJZJ ddlKmLZL ddlMmNZNmOZO ddlPmQZQmRZR ddlSmTZT ddlUmVZVmWZWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZb ddlcmdZd eeefZgejhiefdZjejhiefdZkeld ZmejnG dd  d ZoejnG d!d" d"eoZpG d#d dZqG d$d% d%Zrd\d*d+ZsG d,d- d-Ztd]d4d5Zuejvjwjxejvjwjyejvjwjzejvjwj{ejvjwj|d6Z}G d7d8 d8eqZ~G d9d: d:eqZG d;d< d<eqZd^d?d@Zd_dEdFZG dGdH dHeqZG dIdJ dJeZG dKdL dLeqZ	Md`dadUdVZejnG dWdX dXZe ZG dYdB dBZG dZd[ d[ZdS )b    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)Sequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)get_metric_tableis_metric_table_enabled)free_unbacked_symbols
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)ComputedBufferget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)VZfusionZloop_orderingBaseSchedulerNodec                   @  s   e Zd ZU ded< ded< ded< ejedZded	< ejedZ	d
ed< d%ddZ
d&ddZd%ddZd%ddZd'ddZd(ddZd)ddZd*d d!Zd*d"d#Zd$S )+SchedulerBuffer	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr1   
mpi_bufferreturnstrc                 C  s   | j }|d us	J | S N)rJ   get_name)selfop rU   H/var/www/auris/lib/python3.10/site-packages/torch/_inductor/scheduler.pydefining_op_nameX   s   z SchedulerBuffer.defining_op_nameintc                 C  s   t | jjS rQ   )hashrH   namerS   rU   rU   rV   __hash__]   s   zSchedulerBuffer.__hash__c                 C  s  t  }|  }|| dt| jj  || d| jj  |  r3|| dt|    | 	 rE|| dt| 	   t
| jdkr[|| d| j  | S || d |d | jD ]
}|| d qlW d    n1 sw   Y  |d	 | S )
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r<   rR   	writelinetyperH   __name__layoutget_aliasespformatget_mutationslenrM   indentgetrawvalue)rS   resultrZ   userrU   rU   rV   	debug_str`   s&   

zSchedulerBuffer.debug_strc                 C  
   | j  S rQ   rH   rR   r[   rU   rU   rV   rR   t      
zSchedulerBuffer.get_nameNonec                 C  s   | j d usJ | j  sd S | j  s!| j  s!t| j  tjr+tj	j
| j  d S ttjdra|  tjjv ratjj|   }|| jjv rO| jj| j }n| jj| j }tj	j
|| j  d S tj	j
| j  d S )Nargs)rH   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr   ZCommBufferLayoutrC   graphwrapper_codeZcodegen_allocationhasattrkernelrR   inplace_update_buffersrG   name_to_donated_buffername_to_bufZcodegen_inplace_reuse)rS   Zinput_buffer_nameZinput_bufferrU   rU   rV   allocatew   s6   

zSchedulerBuffer.allocateboolc                 C  sN   | j d usJ t| j jtjst| j rdS | jD ]}t|j tr$ dS qdS NFT)rH   ru   rc   r   
NoneLayoutr?   rM   
OutputNode)rS   userU   rU   rV   can_free   s   
zSchedulerBuffer.can_freec                 C  s\   i }|D ] }t |j|v r||t |j |t |j< q||t |j< qt| | _d S rQ   )idrH   mergelistvaluesrM   )rS   rM   rj   r   rU   rU   rV   	set_users   s    zSchedulerBuffer.set_usersSequence[str]c                 C     | j d usJ | j  S rQ   )rH   rs   r[   rU   rU   rV   rd         
zSchedulerBuffer.get_aliasesc                 C  r   rQ   )rH   rt   r[   rU   rU   rV   rf      r   zSchedulerBuffer.get_mutationsNrO   rP   rO   rX   rO   rp   rO   r   )rM   rL   rO   rp   rO   r   )rb   
__module____qualname____annotations__dataclassesfieldr   rM   r1   rN   rW   r\   rl   rR   r~   r   r   rd   rf   rU   rU   rU   rV   rE   N   s"   
 





!


rE   c                   @  s   e Zd ZU dZded< dS )SchedulerDonatedBufferNrI   rJ   )rb   r   r   rJ   r   rU   rU   rU   rV   r      s   
 r   c                   @  sZ  e Zd ZU ded< ded< ded< ded< ded	< d
ed< dddZdddZdddZdddZdddZdddZ	dd d!Z
dd"d#Zdd'd(Zdd+d,Zdd/d0Zdd2d3Zdd5d6Zdd:d;Zdd<d=Zdd>d?Zdd@dAZddBdCZddDdEZddHdIZddJdKZddLdMZeddNdOZeddPdQZeddRdSZeddTdUZddWdXZddZd[Z dd^d_Z!ddadbZ"ddcddZ#ddedfZ$ddgdhZ%ddidjZ&ddkdlZ'ddmdnZ(ddodpZ)ddsdtZ*ddudvZ+ddwdxZ,	yddd}d~Z-edddZ.edddZ/edddZ0dddZ1dddZ2edddZ3dddZ4dddZ5e6dddZ7dS )rD   z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesrX   	min_order	max_orderr2   Zmpi_noderG   rF   rO   rp   c                 C  s   || _ dd | _d S )Nc                  _  s   g S rQ   rU   )rq   kwargsrU   rU   rV   <lambda>   s    z,BaseSchedulerNode.__init__.<locals>.<lambda>)rG   debug_device_strrS   rG   rU   rU   rV   __init__   s   zBaseSchedulerNode.__init__rH   ir.Operationc                   sR   | _ tt   _tt   _d _ fdd| D  _dd  jD  _d S )NFc                   s   g | ]
}t  j| d qS ))rG   rH   rJ   )rE   rG   ).0outputr[   rU   rV   
<listcomp>   s    z5BaseSchedulerNode._init_from_node.<locals>.<listcomp>c                 S     i | ]}|  |qS rU   rR   r   bufrU   rU   rV   
<dictcomp>       
z5BaseSchedulerNode._init_from_node.<locals>.<dictcomp>)	rH   r   rP   	ancestors
last_usagewrittenget_outputsoutputsoutputs_by_namerS   rH   rU   r[   rV   _init_from_node   s   
z!BaseSchedulerNode._init_from_noderP   c                 C  s   t | j d|  dS )Nz(name=)ra   rb   rR   r[   rU   rU   rV   __repr__      zBaseSchedulerNode.__repr__c                 C  s  |   }t }|| dt| j dtt| ddj d| dt| jj d| dt| j	 d| d	t| jj
| j	  d| d
 |  |  D ]	}||  qKW d   n1 s_w   Y  |d z	||   W n ty   tjddd Y nw |  S )#Longer form printout for trace logsr]   (rH   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        r_   Ignoring error in debug_str()Texc_info)rR   r<   splicera   rb   getattrre   r   writesr   readsrh   r   rl   r`   debug_str_extra	Exceptionlogwarningri   rstrip)rS   rZ   r   outrU   rU   rV   rl      sH   

	
zBaseSchedulerNode.debug_strc                 C     dS )N rU   r[   rU   rU   rV   r         z!BaseSchedulerNode.debug_str_extra	list[str]c                 C  s
   |  | S rQ   )r   r[   rU   rU   rV   _debug_str_for_device  ro   z'BaseSchedulerNode._debug_str_for_devicec                 C  sz   t | jdd }d}t|tjjjrd|j| gddd }nt|tjjj	r7d|j|
 | gddd }|  | S )Ndatar   z, F)shorten	multiline)r   rH   ru   torch	_inductorr   Z	PointwiseZ
str_helperget_sizeZ	ReductionZget_reduction_sizeget_reduction_type)rS   Z
maybe_dataZdata_strrU   rU   rV   debug_str_short  s   
z!BaseSchedulerNode.debug_str_shortc                 C  s   t d| | j| jj d S )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   r[   rU   rU   rV   log_details  s   zBaseSchedulerNode.log_detailsself_depr&   	other_depc                 C     d S rQ   rU   )rS   r   r   rU   rU   rV   reorder_loops_by_dep_pair     z+BaseSchedulerNode.reorder_loops_by_dep_pairrenamesdict[str, str]c                 C     |  | j| d S rQ   )set_read_writesr   renamerS   r   rU   rU   rV   update_mutated_names      z&BaseSchedulerNode.update_mutated_namesdepr%   c                 C  r   rQ   )r   r   	with_readrS   r   rU   rU   rV   add_fake_dep#  r   zBaseSchedulerNode.add_fake_depr   c                 C     t dd |  D S )Nc                 s  s     | ]}|  p| V  qd S rQ   )rd   rf   r   rU   rU   rV   	<genexpr>'  s    
z=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>)anyr   r[   rU   rU   rV   has_aliasing_or_mutation&  s   z*BaseSchedulerNode.has_aliasing_or_mutationrwc                 C  s   || _ | j j| _|   d S rQ   )r   r   r   
prune_deps)rS   r   rU   rU   rV   r   +  s   
z!BaseSchedulerNode.set_read_writesfuture_used_buffersOrderedSet[str]mutation_real_namec                   s,   |   }t fdd|D }|| | _d S )Nc                 3  s    | ]	}  ||V  qd S rQ   )get)r   kr   rU   rV   r   4  s    z3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>)used_or_aliased_buffer_namesr   r   )rS   r   r   Zused_buffersrU   r   rV   set_last_usage0  s   z BaseSchedulerNode.set_last_usagec                 C  s   | j D ]}|  qd S rQ   )r   r~   )rS   r   rU   rU   rV   mark_run7  s   

zBaseSchedulerNode.mark_runc                 C  s"   t dd t| jj| jjD S )Nc                 s      | ]}|j V  qd S rQ   rZ   r   r   rU   rU   rV   r   <  
    
z6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr>)r   	itertoolschainr   r   r   r[   rU   rU   rV   used_buffer_names;  s   z#BaseSchedulerNode.used_buffer_namesc                   s   t t   dd t| jj| jjD }t|dkrB| } 	| t
jj|r<| fddt
jj|  D  t|dks S )Nc                 S     g | ]}|j qS rU   r   r   rU   rU   rV   r   D  s    zBBaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>r   c                 3  s    | ]	}| vr|V  qd S rQ   rU   )r   aliasZ
used_namesrU   rV   r   L  s    zABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>)r   rP   r   r  r   r   r   rg   popaddrC   rw   name_to_bufferr   extendrs   )rS   depsr   rU   r  rV   r   A  s    

z.BaseSchedulerNode.used_or_aliased_buffer_namesc                   s   t  fdd jD  _d S )Nc                 3  s"    | ]}|j  jjvr|V  qd S rQ   )rZ   rG   available_buffer_namesr   r[   rU   rV   r   V      z/BaseSchedulerNode.prune_deps.<locals>.<genexpr>r   r   r[   rU   r[   rV   r   U  s   zBaseSchedulerNode.prune_depsc                   s>   d	 fddt fdd jjD }  j| d S )
Nr   r%   rO   r   c                   s,   t | tsdS  jj| j  }|tjjv S NF)	ru   r(   rG   r}   rZ   rW   rC   rw   removed_operations)r   op_namer[   rU   rV   should_prune^  s   
z7BaseSchedulerNode.prune_weak_deps.<locals>.should_prunec                 3      | ]	} |r|V  qd S rQ   rU   r   r  rU   rV   r   d      
z4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>r   r%   rO   r   )r   r   r   r   remove_reads)rS   	to_removerU   )rS   r  rV   prune_weak_deps\  s
   z!BaseSchedulerNode.prune_weak_depsname_to_fused_nodedict[str, BaseSchedulerNode]c                 C  s   t | || jj d S rQ   )_prune_redundant_depsrG   r}   )rS   r  rU   rU   rV   prune_redundant_depsi  s   z&BaseSchedulerNode.prune_redundant_depsc                 C  r   rQ   )rH   get_operation_namer[   rU   rU   rV   rR   n  r   zBaseSchedulerNode.get_namec                 C  s   |   S rQ   r   r[   rU   rU   rV   get_first_namer  s   z BaseSchedulerNode.get_first_namec                 C  r   )Nc                 s      | ]}|  V  qd S rQ   r   r   rH   rU   rU   rV   r   w      z8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>)r   	get_nodesr[   rU   rU   rV   get_operation_namesu     z%BaseSchedulerNode.get_operation_namesc                 C     t dd | jD S )Nc                 s  r  rQ   r   r   r   rU   rU   rV   r   {  r!  z5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>)r   r   r[   rU   rU   rV   get_buffer_namesy     z"BaseSchedulerNode.get_buffer_namesc                 C  r   )Nc                 s  s&    | ]}t |tot|d dV  qdS )T)Zdisallow_fp32_opsNru   SchedulerNoder    r   nrU   rU   rV   r     s    


zABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>allr"  r[   rU   rU   rV   can_codegen_in_low_precision}  s   z.BaseSchedulerNode.can_codegen_in_low_precisionc                 C  r   )Nc                 s  s"    | ]}t |tot|V  qd S rQ   r)  r+  rU   rU   rV   r     
    
z@BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>r-  r[   rU   rU   rV   r      s   z-BaseSchedulerNode.can_codegen_without_upcastsSequence[BaseSchedulerNode]c                 C  s   | gS rQ   rU   r[   rU   rU   rV   r"       zBaseSchedulerNode.get_nodesSequence[SchedulerBuffer]c                 C     | j S rQ   )r   r[   rU   rU   rV   r     r2  zBaseSchedulerNode.get_outputsbuf_namerE   c                 C  s
   | j | S rQ   )r   )rS   r5  rU   rU   rV   
get_output  ro   zBaseSchedulerNode.get_outputOptional[torch.device]c                 C  r   rQ   )rH   
get_devicer[   rU   rU   rV   r8    r   zBaseSchedulerNode.get_devicec                 C  s   |   }|d uo|jdkS Ncpu)r8  ra   rS   devicerU   rU   rV   is_cpu     zBaseSchedulerNode.is_cpuc                 C  s   |   }|d uot|jS rQ   )r8  r>   ra   r;  rU   rU   rV   r>     r>  zBaseSchedulerNode.is_gpuc                 C  r   r  rU   r[   rU   rU   rV   is_reduction  r   zBaseSchedulerNode.is_reductionc                 C  r   r  rU   r[   rU   rU   rV   is_split_scan  r   zBaseSchedulerNode.is_split_scanc                 C  r   r  rU   r[   rU   rU   rV   is_template  r   zBaseSchedulerNode.is_templatec                 C  r   r  rU   r[   rU   rU   rV   	is_extern  r   zBaseSchedulerNode.is_externc                 C  r   r  rU   r[   rU   rU   rV   
is_foreach  r   zBaseSchedulerNode.is_foreachread_depdependencies.Depc                 C  r   r  rU   rS   rD  rU   rU   rV   can_inplace  r   zBaseSchedulerNode.can_inplacec                 C  r   r  rU   r[   rU   rU   rV   has_side_effects  r   z"BaseSchedulerNode.has_side_effectsc                   sd  ddl m} ttr1tjr1tj	 t
jr1ttjtjjjjr+ttjdddur1ttjds3dS jtjjB jjB  dfd
d} D ]}|j}|dusTJ | rh| sh| sh| tjjv riqIjj D ]}|j!jj"v r~jj"|j! }njj#$|j!}|r.tjj%&|r.t|j't(s.|j)dusJ  fdd|j)D }t*|dkr.|d j+r.|d ju r.|jdur.t|j, t-j.t-j/t-j0fs.|j'rt|j'jt-j1t-j2frt*|j dks.||j|jr.||r.tjj34| |  ttjtjjjjr"tjj56|  tjj56|  | tjj7| <  nqmqIdS )z~
        Decide if there should be inplace updates for the node
        and record the decision in the active kernel.
        r   )can_match_buffer_size	mutationsNrq   buf_to_be_inplacedrE   rO   r   c                   s|   | j }|   t }| jD ]+}|j}t|tsq| j ||ur$q| fdd|j	 D O }t
|dkr; dS qdS )Nc                 3  s    | ]
}|j  kr|V  qd S rQ   r   )r   or5  rU   rV   r     s    
z^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>r   FT)rG   get_fused_noderR   r   rM   rH   ru   rD   r   reads_and_writesrg   )rK  Z
fused_noder
  rk   	user_noder[   rM  rV   single_index_in_fused_node  s$   


zKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_nodec                   s   g | ]}|j   vr|qS rU   rn   r   x)inconsequential_nodesrU   rV   r   	  s
    z;BaseSchedulerNode.decide_inplace_update.<locals>.<listcomp>r   )rK  rE   rO   r   )8codegen.wrapperrI  ru   r*  r   Zinplace_buffersrC   rw   Zhas_featurer8  r!   ZINPLACE_BUFFERSrz   r   r   codegenZsimdZ
SIMDKernelr   ry   r   r  rG   completed_operationsr   rH   rr   rs   rt   rR   removed_buffersr   r   rZ   r|   r}   r   rx   Z	can_reuserJ   NopKernelSchedulerNoderM   rg   rG  rv   r   r   r/   ZMutationLayoutSHOULDREMOVEZFallbackKernelr.   rq   Zmake_inplacerJ  r  r{   )rS   rI  rQ  r   Zbuf_nodereadZ	input_bufZremaining_usesrU   )rT  rS   rV   decide_inplace_update  s   
 



z'BaseSchedulerNode.decide_inplace_updateTbufferr<   	only_oncec           	      C  s  t jsd S |r| jrd S | jd usJ | j }g }|D ]_}|jdkr$q|d |d d|j d|j }d|jv rG|d|jd   }|| d|jv r{|jd  }|	d	d
 }|d|
dd
dd
dd  |d |d qt|dkrd S || d| _d S )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  Zseq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   Zcomment_originr   rH   get_originsrT   appendtargetmetasplitreplacerg   
writelines)	rS   r\  r]  originsZ	out_linesrL  Zop_info_strr_  Zstack_trace_last_linerU   rU   rV   codegen_originating_info6  sB   












z*BaseSchedulerNode.codegen_originating_infoc                 C  s   | j dddS )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implr[   rU   rU   rV   get_read_write_buffers_sizesb     z.BaseSchedulerNode.get_read_write_buffers_sizesc                 C     | j dddS )NTFrn  rq  r[   rU   rU   rV   get_read_buffer_sizesh  rt  z'BaseSchedulerNode.get_read_buffer_sizesc                 C  ru  )NFTrn  rq  r[   rU   rU   rV   get_write_buffer_sizesn  rt  z(BaseSchedulerNode.get_write_buffer_sizesro  rp  c                 C  s   t | j||d ddS )Nrn  r   )start)sumget_read_write_buffer_accessesr   )rS   ro  rp  rU   rU   rV   rr  t  s   z3BaseSchedulerNode.get_read_write_buffers_sizes_impldict[str, int]c                   s  t tri S t trt jtri S dddt tr1t d t d  ntd	t	
t}|rKjjD ]
}||j | q@|r\jjD ]
}||j | qQ|ritd
d jjD nt }|rytdd jjD nt }dfddt trtfdd|D }|| }|| }i }||B D ]I}	tfdd||	 D  |	tjjv rtjj|	 }
n|	tjjv rtjj|	 }
nqd fdd|
}|	|vr|||	< q||	  |7  < q|S )az  
        Counting the number of bytes accessed for a kernel is
        surprisingly tricky. In particular, there is a differentiation
        between 'theoretical' memory accesses and practical memory
        accesses. For example, a layernorm kernel may actually access an
        input 3 times, but in theory, it only needs to access its input
        once (and may be optimized to do so through say, persistent
        reductions)

        Another example is that even though a buffer is passed in, we may
        not access the entire buffer. This may occur if we are accessing
        a slice of the buffer. Another tricky case is for indirect
        indexing, where the amount of bytes accessed depends on the
        values of the input.

        What this function aims to compute is the memory accesses for
        worst-case inputs, best-case optimization. What this means is
        that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

        1. Numel in ranges multiplied by number of deps the buffer has
        2. The buffer size

        Returns memory accesses per buffer.
        s
sympy.ExprrO   rX   c                 S  s   t jjj| ddS )Nr   fallback)rC   rw   sizevars	size_hint)r|  rU   rU   rV   try_size_hint  s   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hintr   r       eAc                 s  r   rQ   r   r   rU   rU   rV   r         zCBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>c                 s  r   rQ   r   r   rU   rU   rV   r     r  r   rP   snodesr1  r   c                   s4    j j|  j}tdd |D }t|t| dkS )Nc                 s  r   rQ   rH   r   rk   rU   rU   rV   r     r  z\BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>r   )rG   r}   rM   r   rg   )r   r  rM   Zbuf_usesr[   rU   rV   is_materialized  s   zIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materializedc                 3  s     | ]} |j s|V  qd S rQ   r  r   )r  rS   rU   rV   r     s    
c                 3  s    | ]} V  qd S rQ   rU   r   )
node_numelrU   rV   r     s    <Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]c                   s   | sdS t | tjr|  S t | jtrGjj|   j	}d}|D ]#}t |j
ts+J t |j
j
trB|j
 D ]	}||j
7 }q7q! dS |S t | jtjr[tfdd|  D S t|  }t|  t | S )Nr   c                 3  s     | ]} t j|V  qd S rQ   )rC   rw   
get_buffer)r   Zmut_name)get_buf_bytesrU   rV   r     s
    
zZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>)ru   r   TorchBindObjectr  rc   r/   rG   r}   rR   rM   rH   rD   r.   r   r   ry  rt   rB   r   r:   Z	get_dtypemin)r   rM   Ztotrk   Z	sched_bufZ	buf_elems)buf_accessed_elemsr  rS   r  rU   rV   r    s.   zGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytesN)r|  r}  rO   rX   )r   rP   r  r1  rO   r   )r   r  rO   rX   )ru   rY  ExternKernelSchedulerNoderH   r.   r*  rB   
get_rangesrX   collectionsr   r   r   r   rZ   rf  r   r   FusedSchedulerNodery  rC   rw   r  graph_inputs)rS   ro  rp  Zbuf_accessesr   r   r   rX  Zbuf_byte_accessesr5  r   Z	buf_bytesrU   )r  r  r  r  rS   r  rV   rz  ~  sd   




#
z0BaseSchedulerNode.get_read_write_buffer_accessesfloatc                   s  |   d  d }|j }tt|sdS t| jr[t| jtj	s%J zt
| jW S  tyC } zt| W Y d}~dS d}~w tyZ } zt| W Y d}~dS d}~ww t| jrbdS |j }zt }t|d }W n
 ty|   Y dS w t| trst| jtjsJ dt| jtt| jddd}|durqddlm} ddlm}	 td	d
 | jjD rdS | }
|	dd}t !| jj"q t #|
[ ddlm$   fdd| jjD }| jj%}|j&|g|R i | jj' d}|( }| ) }|| | d }|| }t*||W  d   W  d   W  d   W  d   S 1 s3w   Y  W d   n	1 sCw   Y  W d   n1 sSw   Y  W d   dS W d   dS 1 slw   Y  dS t| t+st| jt,r| ) | S dS )zB
        Returns estimated op runtime in nanoseconds (ns)
        r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec                 s  s$    | ]}t t| d kV  qdS r   N)rg   r   	get_numelr+  rU   rU   rV   r   6  s
    
z:BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>F)displayr   ir_node_to_tensorc                   s   g | ]} |d dqS )F)Zguard_shaperU   )r   inputr  rU   rV   r   F  s    
z;BaseSchedulerNode.get_estimated_runtime.<locals>.<listcomp>g      ?r  )-r"  r   rH   rv   r>   r,   r=   ru   r   ZIRNoder$   
ValueErrorr   r   	TypeErrorrA   Zmaybe_get_dtyper;   r9   r   r  ExternKernelra   kernel_name_to_opr   r   Ztorch._subclasses.fake_tensorr  Ztorch.utils.flop_counterr  r   inputsrC   set_current_nodeZfx_nodeZset_fake_moder  	__class__Zprocess_kernelr   Zget_total_flopsrs  maxr  r+   )rS   r   rc   edtypeZgpu_memory_bandwidthZ	gpu_flopsrT   r  r  Z	fake_modeZflop_counter_modeZfake_inputsclsfactorZcounted_flopsZcounted_bytesZcompute_timeZtransfer_timerU   r  rV   get_estimated_runtime  s   





"

l* z'BaseSchedulerNode.get_estimated_runtimeOptional[ir.TemplateBuffer]c                 C  r   rQ   rU   r[   rU   rU   rV   get_template_node_  r   z#BaseSchedulerNode.get_template_nodeir.TemplateBufferc                 C  s   |   }|d us
J |S rQ   r  )rS   templaterU   rU   rV   get_template_node_or_throwb  s   z,BaseSchedulerNode.get_template_node_or_thrownodeslist[BaseSchedulerNode]Jtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]c                 C  sD   t dd t| D }| d| }| | }| |d d }|||fS )zQ
        For the list of nodes, get the prologue, template, and epilogue
        c                 s  s     | ]\}}|  r|V  qd S rQ   rA  r   ir,  rU   rU   rV   r   n  s    zCBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>Nr   )next	enumerate)r  Ztemplate_indexprologuetemplate_nodeepiloguerU   rU   rV   get_prologue_template_epilogueg  s
   
z0BaseSchedulerNode.get_prologue_template_epilogueN)rG   rF   rO   rp   )rH   r   rO   rp   r   )rO   r   r   r   r&   r   r&   rO   rp   r   r   rO   rp   )r   r%   rO   rp   r   )r   r   rO   rp   r   r   r   r   rO   rp   rO   r   r  r  rO   rp   rO   r1  )rO   r3  )r5  rP   rO   rE   rO   r7  rD  rE  rO   r   T)r\  r<   r]  r   rO   rp   r   )ro  r   rp  r   rO   rX   )ro  r   rp  r   rO   r{  )rO   r  rO   r  )rO   r  )r  r  rO   r  )8rb   r   r   r   r   r   r   rl   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r  r  rR   r  r6   r#  r'  r/  r    r"  r   r6  r8  r=  r>   r?  r@  rA  rB  rC  rG  rH  r[  rm  rs  rv  rw  rr  rz  r  r  r  staticmethodr  rU   rU   rU   rV   rD      s   
 



































 ,


 
[
c                   @  sD   e Zd ZU g dZded< ded< dddZdddZdddZdS )	WhyNoFuse)node1node2reasonrq   rP   r  ztuple[Any, ...]rq   r  rD   r  rO   rp   c                 C  s   || _ || _d S rQ   r  r  rS   r  r  rU   rU   rV   r   }  s   
zWhyNoFuse.__init__r   c                 G  s   || _ || _t|  d S rQ   )r  rq   
fusion_logdebug)rS   r  rq   rU   rU   rV   __call__  s   zWhyNoFuse.__call__c                 C  s*   d| j   d| j  d| j| j  S )Nzcannot fuse z with r]   )r  rR   r  r  rq   r[   rU   rU   rV   __str__  s   
zWhyNoFuse.__str__Nr  rD   r  rD   rO   rp   )r  rP   rq   r   rO   rp   r   )rb   r   r   	__slots__r   r   r  r  rU   rU   rU   rV   r  v  s   
 

r  objr   rO   rP   c                 C  sF   t | ttfrt| td} tj| dd}d|v r!dt|d S |S )Nkey   )rh   r       )	ru   r   setsortedrP   pprintre   textwraprh   )r  rj   rU   rU   rV   re     s   re   c                   @  s8   e Zd ZdddZddd	ZdddZdddZeZdS )r   r   r'   rO   rp   c                 C  s   t |g| _d S rQ   r  r   rU   rU   rV   r        zOutputNode.__init__r   c                 C  r   r  rU   r[   rU   rU   rV   r?    r   zOutputNode.is_reductionr   c                 C  r   )NrU   rU   r[   rU   rU   rV   rs     r   z'OutputNode.get_inputs_that_alias_outputrP   c                 C  r   )NZOUTPUTrU   r[   rU   rU   rV   rR     r   zOutputNode.get_nameN)r   r'   rO   rp   r   r   r   )rb   r   r   r   r?  rs   rR   r   rU   rU   rU   rV   r     s    



r   rH   r  r  r}   dict[str, SchedulerBuffer]rp   c                   s   t  jD ]}t|ts! |j  }|    d7  < qd fddtfdd	jD }|rKj| _	j
| d
S d
S )am  
    Prunes weakdeps intended for mutation ordering
    on an upstream fused node if after fusion there is another dependency
    on the fused upstream node, making the weakdep redundant

    In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
    be incrementally removed, enabling other fusions, ensuring they are fused in order.
    r   r   r%   rO   r   c                   sD   t | tr  | j  }|   dk}| k}|p|S dS )Nr   F)ru   r(   rZ   rW   rR   )r   r  Zis_redundantZis_self_dep)r}   name_to_dep_countr  rH   rU   rV   r    s   
z+_prune_redundant_deps.<locals>.should_prunec                 3  r  rQ   rU   r   r  rU   rV   r     r  z(_prune_redundant_deps.<locals>.<genexpr>Nr  )r  r   r   ru   r(   rZ   rW   rR   r   r   r   r  )rH   r  r}   r   r  Zdeps_to_prunerU   )r}   r  r  rH   r  rV   r    s   

r  )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmzextern_kernels._scaled_mmc                      s<   e Zd Zd fddZdd
dZdddZdddZ  ZS )r  rG   rF   rH   r   rO   rp   c                   (   t  | | | | |  d S rQ   superr   r   r   Zget_read_writesrS   rG   rH   r  rU   rV   r        
z"ExternKernelSchedulerNode.__init__rP   c                 C  s   |    dt| jdd  S )Nz.node.kernel = r  )rR   r   rH   r[   rU   rU   rV   r     s   z)ExternKernelSchedulerNode.debug_str_extrar   c                 C  r   NTrU   r[   rU   rU   rV   rB    r   z#ExternKernelSchedulerNode.is_externc                 C  s$   | j d usJ t| j do| j  S )NrH  )rH   ry   rH  r[   rU   rU   rV   rH    s   z*ExternKernelSchedulerNode.has_side_effectsrG   rF   rH   r   rO   rp   r   r   )rb   r   r   r   r   rB  rH  __classcell__rU   rU   r  rV   r    s
    

r  c                      s   e Zd Zd	 fddZ  ZS )
rY  rG   rF   rH   r   rO   rp   c                   r  rQ   r  r  r  rU   rV   r     r  zNopKernelSchedulerNode.__init__r  )rb   r   r   r   r  rU   rU   r  rV   rY    s    rY  c                      s  e Zd ZU ded< ded< dN fddZ		dOdPddZ		dOdPddZdQddZdRddZdSdd Z	dTd$d%Z
dUd'd(ZdVd*d+ZdWd,d-ZdWd.d/ZdWd0d1ZdXd3d4ZdYd7d8ZdZd:d;Zd[d<d=Z	>d\d]dAdBZed^dCdDZed^dEdFZd_dIdJZed`dLdMZ  ZS )ar*  z tuple[Sequence[sympy.Expr], ...]_sizesr0   _bodyrG   rF   rH   +Union[ir.ComputedBuffer, ir.TemplateBuffer]rO   rp   c                   s"   t  | | | |   d S rQ   )r  r   r   _compute_attrsr  r  rU   rV   r     s   
zSchedulerNode.__init__Nextra_indexing_constraints*Optional[tuple[dict[Any, Any], list[Any]]]recompute_sizes_body_funcOptional[Callable[..., Any]]c                 C  s   t | jtjtjfsJ | jj||d\| _| _| j }| j	
|j}||| jf| _tj p5t|j }t | jtjrI| | jj|d d S | tj| jg| jR d|i d S )Nr  r  )	normalizer  )ru   rH   r   r+   TemplateBufferZsimplify_and_reorderr  r  Zget_device_or_errorrG   get_backendgroup_fnr   r   loop_ordering_after_fusionr>   ra   r   extract_read_writesr   )rS   r  r  r<  r  Zshould_normalizerU   rU   rV   r    s0   

zSchedulerNode._compute_attrsc                 C  s   | j ||d d S )Nr  )r  )rS   r  r  rU   rU   rV   recompute_size_and_body  s   
z%SchedulerNode.recompute_size_and_bodyr  r   need_clear_tiling_cachec                 C  sl   t dd | jjD }| tj| jg| jR d|i| | j	
|  |r4ddlm} |j  d S d S )Nc                 s  s"    | ]}t |ttfr|V  qd S rQ   )ru   r(   r'   r   rU   rU   rV   r   -  s    
z5SchedulerNode.refresh_dependencies.<locals>.<genexpr>r  r   SIMDScheduling)r   r   r   r   r   r  r  r  r   pointwise_read_writesclear_cachecodegen.simdr  Zcandidate_tilingscache_clear)rS   r  r  Z	fake_depsr  rU   rU   rV   refresh_dependencies(  s$   z"SchedulerNode.refresh_dependencies	new_orderSequence[int]c                 C  s*   | j || _ | j j| _| jddd d S )NFTr  r  )r  Zreorder_iter_loopssizesr  r  )rS   r  rU   rU   rV   apply_new_loop_orderD  s
   
z"SchedulerNode.apply_new_loop_orderc                 C  s(   | j  | _ | j j| _| jddd d S )NTFr  )r  merge_loopsr  r  r  r[   rU   rU   rV   r	  L  s   
zSchedulerNode.merge_loopsr   r&   r   c                 C  s~   d }| j d }t||j  kr|jkrn n||}|r5t jd7  _td|  | | 	| d S td|   d S )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r  rg   num_varsdecide_loop_order_to_matchr   num_loop_reorderingloop_ordering_logr  rR   r  )rS   r   r   r  
self_sizesrU   rU   rV   r   X  s   
 

z'SchedulerNode.reorder_loops_by_dep_pairrP   c                 C  s   |   }| d| jd  | d| jd  | d| j g}| j D ]#}t|tsG|j}tj	
|}t|tjsG|| dt|j  q$t| jtrc|d| d |t| j d	 | jd usjJ ||   d
|S )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r   )rR   r   r  r   rO  ru   r(   rZ   rC   rw   r  r   r  rf  re   rc   r  r0   r  rh   rl   rH   r	  r   join)rS   rZ   linesr   r5  r   rU   rU   rV   r   l  s$   

zSchedulerNode.debug_str_extraSequence[Sequence[sympy.Expr]]c                 C  r4  rQ   )r  r[   rU   rU   rV   r    r2  zSchedulerNode.get_rangesc                 C  s6   t | jtjtjfsJ dt| jt| j S Nr  )ru   rH   r   r+   r  ra   r   r   r[   rU   rU   rV   r?    s   zSchedulerNode.is_reductionc                 C  sF   t | jtjtjfsJ dt| jt | jtjo"t | jjtjS r  )ru   rH   r   r+   r  ra   r   Z	SplitScanr[   rU   rU   rV   r@    s   
zSchedulerNode.is_split_scanc                 C  s   t | jtjS rQ   ru   rH   r   r  r[   rU   rU   rV   rA       zSchedulerNode.is_templater  c                 C  s   t | jtjr
| jS d S rQ   r  r[   rU   rU   rV   r       zSchedulerNode.get_template_node
index_varsSequence[sympy.Expr]c                 G  s   |    |   | | d S rQ   )r[  r   rV  )rS   r  rU   rU   rV   run  s   zSchedulerNode.rundict[sympy.Expr, sympy.Expr]c                 C  sH   | j }ttt|ttt|ksJ tttj|tj|}|S rQ   )	r  ry  maprg   dictzipr   r  from_iterable)rS   r  r  
var_rangesrU   rU   rV   ranges_from_index_vars  s    

z$SchedulerNode.ranges_from_index_varsc              	   C  s   |  |}zCttt |. tj|  | j|  W d    n1 s'w   Y  W d    W d S W d    W d S 1 sAw   Y  W d S  tyW   t	
d| j  w )NzError in codegen for %s)r  rC   Zset_ops_handlerr5   Zget_ops_handlerrz   r  r  r   r   fatalrH   )rS   r  r  rU   rU   rV   rV    s   

VzSchedulerNode.codegenT	pointwiser   c                 C  s:   |r| j nt| j \}}tj| j|tjjgt| gdS )z\
        Get the memory dependencies in either the pointwise or the reduction axes.
        )Zhidden_args)	r  reversedr   r  r  sympySZZerorg   )rS   r!  Z
keep_sizesZignore_sizesrU   rU   rV   "pointwise_or_reduction_read_writes  s   z0SchedulerNode.pointwise_or_reduction_read_writesc                 C     | j ddS )zH
        Get the memory dependencies in the non-reduction axes.
        Tr!  r%  r[   rU   rU   rV   r       z#SchedulerNode.pointwise_read_writesc                 C  r&  )zD
        Get the memory dependencies in the reduction axes.
        Fr'  r(  r[   rU   rU   rV   reduction_read_writes  r)  z#SchedulerNode.reduction_read_writesrD  rE  c                 C  s   |   rdS tdd |  D rdS t| jjdkrDt|tjrDt	t
| jj}t|tjs8J dt||j|jkoC|j|jkS dS )NFc                 s  r  rQ   )rd   r&  rU   rU   rV   r     r!  z,SchedulerNode.can_inplace.<locals>.<genexpr>r   ztype(write_dep)=)rA  r   r   rg   r   r   ru   r   r&   r  iterra   indexsize)rS   rD  Z	write_deprU   rU   rV   rG    s   zSchedulerNode.can_inplacer   c                 C  s   t t  }t| jtrR| j D ]A}|jdkrQ|jdkrQd|jv r(|jd dks6t	|j
dkrQ|j
d dkrQ|d|jv rB|jd nt	|j
dkrN|j
d	 nd
 q|S )NZcall_methodstoremode
atomic_add   r  rZ      r   r   )r   rP   ru   r  r0   r"  rT   rg  r   rg   rq   r  )rS   Zbuffers_store_as_atomic_addrH   rU   rU   rV   _get_atomic_add_buffers  s   




z%SchedulerNode._get_atomic_add_buffers)rG   rF   rH   r  rO   rp   NN)r  r  r  r  rO   rp   )r  r   r  r   rO   rp   )r  r  rO   rp   r   r  r   )rO   r  r   r  )r  r  rO   rp   )r  r  rO   r  )r  r  rO   rp   r  )r!  r   rO   r   )rO   r   r  r  )rb   r   r   r   r   r  r  r  r  r	  r   r   r  r?  r@  rA  r  r  r  rV  r%  r6   r  r*  rG  r3  r  rU   rU   r  rV   r*    s@   
 "














r*  group_snode/Union[FusedSchedulerNode, GroupedSchedulerNode]c                   sV    j } tjdd |D  t fddtjdd |D  D  jj  _	d S )Nc                 S  r  rU   r   rR  rU   rU   rV   r         z3refresh_group_node_dependencies.<locals>.<listcomp>c                 3  "    | ]}|j   vr|V  qd S rQ   rZ   r'  r   r5  rU   rV   r     r  z2refresh_group_node_dependencies.<locals>.<genexpr>c                 S  r  rU   )r   rR  rU   rU   rV   r     r8  )
r  r   r   
ReadWrites
merge_listr   unionr   r   r   )r5  r  rU   r;  rV   refresh_group_node_dependencies  s   r?  rG   rF   r  r  c                 C  s   t | ttfs	J || _|| _d | _tjdd |D  | _t	|  t
dd | jD | _tdd | jD | _dd |  D | _d S )Nc                 S  s   g | ]
}|j d ur|j qS rQ   )r   rR  rU   rU   rV   r         z#init_group_node.<locals>.<listcomp>c                 s  r   rQ   r   rR  rU   rU   rV   r     r  z"init_group_node.<locals>.<genexpr>c                 s  r   rQ   )r   rR  rU   rU   rV   r     r  c                 S  r   rU   r   r   rU   rU   rV   r     r   z#init_group_node.<locals>.<dictcomp>)ru   r  GroupedSchedulerNoder  rG   rH   r   r>  r   r?  r  r   r  r   r   r   )r5  rG   r  rU   rU   rV   init_group_node  s   rC  c                      s>  e Zd ZU dZded< edMdd	ZdNddZdO fddZe	dPddZ
dPddZe	dQddZdRddZdPdd ZdPd!d"ZdS fd&d'Ze	dQd(d)Ze	dQd*d+ZdTd-d.ZdPd/d0Ze	dUd2d3Ze	dUd4d5Ze	dUd6d7Ze	dVd9d:ZdWd<d=Ze	dUd>d?ZdXdAdBZdYdEdFZdZdIdJZdPdKdLZ  ZS )[r  z
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be fused together. The way it does this is by maintaining
    its unmet dependencies as the union of its constituent nodes.
    r  r  r  rD   r  rO   c                 C  s:  |j |j u sJ t|ttfsJ | rt|trt|jts"J t|j	j
dks,J ttt|j	j
ts9J tt|j	j
j}dd | D }t|dksSJ |d }t|j	j
dksaJ tt|j	j
}t|tspJ tt||j|j|j|jg|j	_
n	t|ttfsJ tt| | }| |j |S )Nr   c                 S     g | ]}|  r|qS rU   r  r   rU   rU   rV   r   3      z+FusedSchedulerNode.fuse.<locals>.<listcomp>r   )rG   ru   r*  r  rA  r  rH   r.   rg   r   r   r  r+  r'   rZ   r"  r&   r   r,  Z	var_namesr-  r/  r   r   r  )r  r  r  rZ   template_nodesr  writer  rU   rU   rV   fuse#  s,   
zFusedSchedulerNode.fuser   r&   r   rp   c                 C  s  |   rd S d }| jD ]%}t|tsJ |d ur+t|t|jd kr+td  d S |jd }qd }|d us9J t||j	  krG|j	krNn n|
|}|sZtd|   d S t jd7  _td|  | | jD ]}t|tsvJ || qmt|  d S )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)rA  r  ru   r*  tupler  r  r  rg   r
  r  rR   r   r  r  r?  )rS   r   r   r  snoder  rU   rU   rV   r   E  s:   
 


z,FusedSchedulerNode.reorder_loops_by_dep_pairrG   rF   c                   s6   t  | t| || g | _t|dd dj| _d S )Nc                 S  s   t |  S rQ   )rX   r?  rS  rU   rU   rV   r   n  s    z-FusedSchedulerNode.__init__.<locals>.<lambda>r  )r  r   rC  rM   r  r   rS   rG   r  r  rU   rV   r   j  s   zFusedSchedulerNode.__init__rP   c                 C     d dd | jD S )N_c                 S     g | ]}|  qS rU   r   rR  rU   rU   rV   r   r      z/FusedSchedulerNode.get_name.<locals>.<listcomp>r  r  r[   rU   rU   rV   rR   p  r$  zFusedSchedulerNode.get_namec                 C     | j d  S Nr   r  rR   r[   rU   rU   rV   r  t  r  z!FusedSchedulerNode.get_first_namer   c                 C     t jdd | jD  S )Nc                 S  rO  rU   r'  rR  rU   rU   rV   r   y  rP  z7FusedSchedulerNode.get_buffer_names.<locals>.<listcomp>r   r>  r  r[   rU   rU   rV   r'  w  r$  z#FusedSchedulerNode.get_buffer_nameslist[SchedulerBuffer]c                 C  "   g }| j D ]	}||  q|S rQ   r  r	  r   rS   rj   rH   rU   rU   rV   r   {     
zFusedSchedulerNode.get_outputsc                   sP    fddt  jD } jd j}|d ur|   td| dS )Nc                   s,   g | ]\}}    d | d|  qS )z.snodes[z] =
)rR   rl   )r   r  rH   r[   rU   rV   r     s    z6FusedSchedulerNode.debug_str_extra.<locals>.<listcomp>r   r   r  )	r  r  rH   r	  r   r  rh   r  r   )rS   r  rH   rU   r[   rV   r     s   
z"FusedSchedulerNode.debug_str_extrac                 C  s   dd | j D }|  d| S )Nc                 S  rO  rU   )r   r   rU   rU   rV   r     rP  z6FusedSchedulerNode.debug_str_short.<locals>.<listcomp>z
, snodes: r  )rS   Z
snodes_strrU   rU   rV   r     s   z"FusedSchedulerNode.debug_str_shortr   r   r   c                   sD   t  || tt  }t| jD ]}||| ||j qd S rQ   )r  r   r   rP   r"  r  updater   )rS   r   r   rH   r  rU   rV   r     s   
z!FusedSchedulerNode.set_last_usagec                 C  rU  )Nc                 S  rO  rU   )r  rR  rU   rU   rV   r     rP  z8FusedSchedulerNode.used_buffer_names.<locals>.<listcomp>rW  r[   rU   rU   rV   r    r$  z$FusedSchedulerNode.used_buffer_namesc                 C  rU  )Nc                 S  rO  rU   )r   rR  rU   rU   rV   r     rP  zCFusedSchedulerNode.used_or_aliased_buffer_names.<locals>.<listcomp>rW  r[   rU   rU   rV   r     s   z/FusedSchedulerNode.used_or_aliased_buffer_namesr1  c                 C  r4  rQ   r  r[   rU   rU   rV   r"    r2  zFusedSchedulerNode.get_nodesc                 C  s   t | j d|   dS )Nz(nodes=r   r   r[   rU   rU   rV   r     r   zFusedSchedulerNode.__repr__r   c                 C  r%  )Nc                 s  r  rQ   )r?  rR  rU   rU   rV   r     r!  z2FusedSchedulerNode.is_reduction.<locals>.<genexpr>r   r  r[   rU   rU   rV   r?    r(  zFusedSchedulerNode.is_reductionc                 C  r%  )Nc                 s  r  rQ   )r@  rR  rU   rU   rV   r     r!  z3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>r^  r[   rU   rU   rV   r@    r(  z FusedSchedulerNode.is_split_scanc                 C  r%  )Nc                 s  r  rQ   r  rR  rU   rU   rV   r     r!  z1FusedSchedulerNode.is_template.<locals>.<genexpr>r^  r[   rU   rU   rV   rA    r(  zFusedSchedulerNode.is_templater  c                 C  s$   | j D ]}| r|   S qd S rQ   )r  rA  r  r   rU   rU   rV   r    s
   
z$FusedSchedulerNode.get_template_nodetorch.devicec                 C  s
   | j d S rS  )r   r[   rU   rU   rV   r8    ro   zFusedSchedulerNode.get_devicec                 C  r%  )Nc                 s  r  rQ   )r   rR  rU   rU   rV   r     r!  z>FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>r^  r[   rU   rU   rV   r     r(  z+FusedSchedulerNode.has_aliasing_or_mutationr   c                 C     t rQ   NotImplementedErrorr   rU   rU   rV   r     r   z'FusedSchedulerNode.update_mutated_namesrZ   r%   c                 C  r`  rQ   ra  )rS   rZ   rU   rU   rV   r     r   zFusedSchedulerNode.add_fake_deprD  rE  c                 C  r`  rQ   ra  rF  rU   rU   rV   rG    r   zFusedSchedulerNode.can_inplacec                 C  s  |   }ddd | jD }t }|| dt| j d| d| dt| jj	 d| d	t| j
 d| d
t| jj| j
  d| d |  |  D ]	}||  qOW d   n1 scw   Y  |d z	||   W n ty   tjddd Y nw |  S )r   r^   c                 s  s    | ]}t |jV  qd S rQ   )ra   rb   r+  rU   rU   rV   r         z/FusedSchedulerNode.debug_str.<locals>.<genexpr>r]   r   r   r   r   r   r   z.outputs = [
            Nr_   r   Tr   )rR   r  r  r<   r   ra   rb   re   r   r   r   r   rh   r   rl   r`   r   r   r   r   ri   r   )rS   rZ   Znode_typestrr   r   rU   rU   rV   rl     sJ   

	
zFusedSchedulerNode.debug_strr  rD   r  rD   rO   r  r  rG   rF   r  r  rO   rp   r   r  rO   rX  r  r  r   r  )rO   r_  r  )rZ   r%   rO   rp   r  ) rb   r   r   __doc__r   classmethodrH  r   r   r6   rR   r  r'  r   r   r   r   r  r   r"  r   r?  r@  rA  r  r8  r   r   r   rG  rl   r  rU   rU   r  rV   r    sJ   
 
!%









r  c                      s   e Zd ZU dZd<ddZd=d	d
Zed>ddZed?ddZ			d@dA fddZ	edBddZ
edCd!d"ZeZd#ed$< edDd&d'ZedCd(d)ZdEd*d+ZdEd,d-ZdFd.d/ZdGd0d1ZdHd3d4ZdId6d7ZdJd:d;Z  ZS )KForeachKernelSchedulerNodez
    This is a schedular node that consists of a set of scheduler nodes that
    has no data dependencies among them and can be executed in parallel.
    producerrD   rO   rI   c                 C  s2   |  D ]}| | jv r| j|    S qd S rQ   )r   rR   read_to_node)rS   rj  r   rU   rU   rV   get_consumer_subnode_for  s
   z3ForeachKernelSchedulerNode.get_consumer_subnode_forconsumerc                 C  sp   t t  }|jjD ] }|j| jjvrq	| jj|j  }|| jv r)|	| j|  q	t
|dkr6tt|S d S Nr   )r   rD   r   r   rZ   rG   r}   rW   name_to_noder  rg   r  r+  )rS   rm  Z	producersrdZ	node_namerU   rU   rV   get_producer_subnode_for  s   

z3ForeachKernelSchedulerNode.get_producer_subnode_forr   c                   s&  t  |}  r;| r;tt  tt|}t jt|jk}|s)|d |o:t fddt j|jD S | re 	 rI|d dS tt|}|
 }|d ur_|j |S |d dS   r|	 rs|d dS tt   |}|d ur j||S |d dS td	)
Nzforeach do not have same lengthc                 3  s"    | ]\}} j ||V  qd S rQ   )rG   can_fuser   lrrj  rU   rV   r     s
    
z6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  rC  typingcastri  rg   r  r.  r  r?  rl  rG   rr  rq  AssertionError)r  rj  rm  whyZforeach_matchconsumer_subnodeproducer_subnoderU   rv  rV   rr    sJ   


z#ForeachKernelSchedulerNode.can_fusec                 C  s  |  s
|  s
J |  rtt|}|j}|j}ntt|}|j}|j}d }d }|  rL|  rLtt|}tt|}dd t|j|jD }nj|  rtt|}||}g }|}d }|jD ]}	|	|u rxt	
|	|}
|
}||
 qd||	 qdn7|  rtt|}||}g }|}d }|jD ]}	|	|u rt	
||	}
|
}||
 q||	 qntd| |j|||||dS )Nc                 S  s   g | ]
\}}t ||qS rU   )r  rH  rs  rU   rU   rV   r   Q  s    
z3ForeachKernelSchedulerNode.fuse.<locals>.<listcomp>zTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)rC  rw  rx  ri  r}  r  r  r  rq  r  rH  rf  rl  ry  rG   )r  rj  rm  r}  r  r~  r  fused_nodesr|  rH   new_noder{  rU   rU   rV   rH  >  sj   



zForeachKernelSchedulerNode.fuseNFrG   rF   r  r  r}  r~  r  r  rp   c                   s  i  _ i  _|d u s|d u r4t || |D ]}|jjD ]}| j |j< q| D ]}	| j|	< q*qny| _| _	d  _
g  _ tj|j|jg t fddt|j|jD  jj  _t|j|jg _t|j|jg _| rt|tsJ ||}
}nt|tsJ ||}
}|
j _ j|j |
j _| D ]}	| j|	< q| _|d  }|sJ |t dfff _!tt"j#j$   _%| _&d S )Nc                 3  r9  rQ   r:  r   r[   rU   rV   r     s    z6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>r   Zcombo_kernel)'rk  ro  r  r   r   r   rZ   r#  rG   r  rH   rM   r   r   r<  r=  r   r>  r   r   r  r   r  r   rC  ru   ri  r   r]  r}  r8  r#  Exprr   r   ZfxNoderl  r  )rS   rG   r  r}  r~  r  r  rH   rZ  rZ   Zforeach_nodeZ
other_noder<  r  r[   rV   r     s\   	


z#ForeachKernelSchedulerNode.__init__r  c                   s   dd |D }|rt dt|dd |D  dd |D }dd |D }|r/t dt| dd |D }d	d |D   rJt d
tt g  fdd|D }|S )Nc                 S     g | ]	}t |tr|qS rU   )ru   r  rR  rU   rU   rV   r     s    z?ForeachKernelSchedulerNode.combinable_nodes.<locals>.<listcomp>z/ComboKernels: %d external nodes are filtered %sc                 S  s    g | ]}|j d ur|j  qS rQ   rH   re  r   rU   rU   rV   r     s     c                 S  s   g | ]}t |ttfs|qS rU   )ru   rY  r  rR  rU   rU   rV   r     s    c                 S  r  rU   ru   ri  rR  rU   rU   rV   r     
    
z+ComboKernels: %d foreach nodes are filteredc                 S  s   g | ]	}t |ts|qS rU   r  rR  rU   rU   rV   r     r  c                 S  rD  rU   r  rR  rU   rU   rV   r     rE  z,ComboKernels: %d template nodes are filteredc                   s   g | ]}| vr|qS rU   rU   rR  rF  rU   rV   r     rE  )r   r  rg   r   )r  r  ZexternZfiltered_nodesZforeach_nodesrU   r  rV   combinable_nodes  s4   z+ForeachKernelSchedulerNode.combinable_nodeslist[list[BaseSchedulerNode]]c                   sD   |   }g }d |D ]| fddtdt D  q
|S )zS
        Returns a list of lists of nodes that are to be grouped together.
           c                   s   g | ]
}||   qS rU   rU   )r   r  Zmax_num_nodesr  rU   rV   r     s    zUForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels.<locals>.<listcomp>r   )_topological_sort_nodesr	  rangerg   )rG   Zsorted_nodesZgrouped_nodesrU   r  rV   &_default_group_nodes_for_combo_kernels  s   zAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelscustom_group_algorithmc                 C  s
   | t _d S rQ   ri  r  )r  rU   rU   rV   %set_group_algorithm_for_combo_kernels  s   z@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernelsc                 C  s
   t | S rQ   r  rG   rU   rU   rV   group_nodes_for_combo_kernels  s   
z8ForeachKernelSchedulerNode.group_nodes_for_combo_kernelsc                 C  r`  rQ   ra  r[   rU   rU   rV   r   
  r   z#ForeachKernelSchedulerNode.mark_runc                 C  r`  rQ   ra  r[   rU   rU   rV   rV    r   z"ForeachKernelSchedulerNode.codegenc                 C  r   r  rU   r[   rU   rU   rV   rC    r   z%ForeachKernelSchedulerNode.is_foreachc                 C  s
   t | jS )zeReturns a list of nodes which comprise the combo kernel.
        These nodes may be vertically fused.)r   r  r[   rU   rU   rV   get_subkernel_nodes  s   
z.ForeachKernelSchedulerNode.get_subkernel_nodesr1  c                 C  s   t tjdd | jD S )zqReturns all nodes contained in this kernel, unpacking fused nodes
        into their constituent scheduler nodes.c                 s  r  rQ   )r"  rR  rU   rU   rV   r     r!  z7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>)r   r   r  r  r  r[   rU   rU   rV   r"    s   z$ForeachKernelSchedulerNode.get_nodesrP   c                 C  rR  rS  )r  r  r[   rU   rU   rV   r    r  z)ForeachKernelSchedulerNode.get_first_namer  r  c                 C  s*   t | || jj | jD ]}|| qd S rQ   )r  rG   r}   r  r  )rS   r  rH   rU   rU   rV   r     s   
z/ForeachKernelSchedulerNode.prune_redundant_deps)rj  rD   rO   rI   )rm  rD   rO   rI   rj  rD   rm  rD   rO   r   )rj  rD   rm  rD   rO   ri  )NNF)rG   rF   r  r  r}  r   r~  rI   r  rI   r  r   rO   rp   r  r  rO   r  )rG   rF   rO   r  )r  r  rO   rp   r   r   rO   r  r  r   r  )rb   r   r   rg  rl  rq  rh  rr  rH  r   r  r  r  r  r   r  r  r   rV  rC  r  r"  r  r  r  rU   rU   r  rV   ri    s:   
 

	.ED 






ri  c                      s   e Zd ZU dZded< ed&ddZd' fd
dZd(ddZd)ddZ	e
d*ddZd*ddZe
d+ddZd,ddZd-ddZed.d$d%Z  ZS )/rB  aC  
    This is a "fake" scheduler node that represents a group of scheduler nodes
    that are meant to be *grouped* together (it does not allow another node to be scheduled
    in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
    The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
    Fusion will still happen among the nodes within each GroupedSchedulerNode.
    At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
    r  r  rO   c                   sX   |d j  t fdd|D sJ |  |}|D ]	}| j| < q| j| < |S )Nr   c                 3  s    | ]}|j  u V  qd S rQ   r  r   r  rU   rV   r   8  rc  z.GroupedSchedulerNode.create.<locals>.<genexpr>)rG   r.  r  rR   )r  r  Zgrouped_snoderJ  rU   r  rV   create5  s   

zGroupedSchedulerNode.createrG   rF   rp   c                   s   t  | t| || d S rQ   )r  r   rC  rL  r  rU   rV   r   ?  s   zGroupedSchedulerNode.__init__c                 C  s8   | j D ]
}|| jj| < q| jj|  = | j| j S )z
        Do fusion among nodes within this GroupedSchedulerNode,
        and then unpack this GroupedSchedulerNode into regular nodes.
        )r  rG   r  rR   
fuse_nodes)rS   rJ  rU   rU   rV   unpackC  s   
zGroupedSchedulerNode.unpackfake_depr%   c                 C  s"   |  | j| | j| d S rQ   )r   r   r   r   r  )rS   r  rU   rU   rV   r   M  s   z!GroupedSchedulerNode.add_fake_deprP   c                 C  rM  )NrN  c                 S  rO  rU   r   rR  rU   rU   rV   r   S  rP  z1GroupedSchedulerNode.get_name.<locals>.<listcomp>rQ  r[   rU   rU   rV   rR   Q  r$  zGroupedSchedulerNode.get_namec                 C  rR  rS  rT  r[   rU   rU   rV   r  U  r  z#GroupedSchedulerNode.get_first_namer   c                 C  rU  )Nc                 S  rO  rU   rV  rR  rU   rU   rV   r   Z  rP  z9GroupedSchedulerNode.get_buffer_names.<locals>.<listcomp>rW  r[   rU   rU   rV   r'  X  r$  z%GroupedSchedulerNode.get_buffer_namesrX  c                 C  rY  rQ   rZ  r[  rU   rU   rV   r   \  r\  z GroupedSchedulerNode.get_outputsr1  c                 C  r4  rQ   r  r[   rU   rU   rV   r"  b  r2  zGroupedSchedulerNode.get_nodesrj  rD   rm  r   c                 C  r   r  rU   )r  rj  rm  rU   rU   rV   rr  e  r   zGroupedSchedulerNode.can_fuse)r  r  rO   rB  re  r  )r  r%   rO   rp   r   r  rf  r  r  )rb   r   r   rg  r   rh  r  r   r  r   r6   rR   r  r'  r   r"  rr  r  rU   rU   r  rV   rB  )  s"   
 		





rB  rU   stride_lengthslist[list[int]]r  r  priority_idxtuple[int, ...]	list[int]c                   sb   t jd fdd}ttttd }t|dkr&fdd	|D tjr/|j|d
 |S )z
    A heuristic to decide loop iteration orders.  This has not been well
    tuned and may be something we should autotune.
    arX   brO   c                   s     dks dkrt   dk dkS  fddD }fddD }tdd t||D }tdd t||D }||krIdS ||krOdS t  S )	Nr   c                      g | ]}t |  qS rU   absr   sl)r  rU   rV   r   }  rE  z6pick_loop_order.<locals>.index_cmp.<locals>.<listcomp>c                   r  rU   r  r  )r  rU   rV   r   ~  rE  c                 s  s$    | ]\}}|d kp||k V  qdS r  rU   r   Zsl_aZsl_brU   rU   rV   r         
z5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>c                 s  s$    | ]\}}|d kp||k V  qdS r  rU   r  rU   rU   rV   r     r  ra  )r7   ry  r  )r  r  Zstride_len_aZstride_len_bZa_firstZb_firstr  r  )r  r  rV   	index_cmpu  s   
z"pick_loop_order.<locals>.index_cmpr   c                      g | ]} | qS rU   rU   )r   pi)r  rU   rV   r     rP  z#pick_loop_order.<locals>.<listcomp>r  N)r  rX   r  rX   rO   rX   )		functools
cmp_to_keyr   r"  r  rg   r   Zpick_loop_orderssort)r  r  r  r  orderrU   r  rV   pick_loop_orderk  s   
r  c                   @  sV   e Zd ZU ded< dZded< dZded< dd	d
ZdddZdddZdddZ	dS )NodeUser$Union[BaseSchedulerNode, OutputNode]rH   Fr   rG  is_weakrO   rX   c                 C  s   t | j | j| jfS rQ   )rY   rH   rR   rG  r  r[   rU   rU   rV   r\     r  zNodeUser.__hash__otherobjectc                 C  s2   t |to|  | ko| j|jko| j|jkS rQ   )ru   r  rR   rG  r  rS   r  rU   rU   rV   __eq__  s   


zNodeUser.__eq__rP   c                 C  rm   rQ   rn   r[   rU   rU   rV   rR     ro   zNodeUser.get_namec                 C  s.   | j |j u sJ t| j | jo|j| jo|jS rQ   )rH   r  rG  r  r  rU   rU   rV   r     s   

zNodeUser.mergeNr   )r  r  rO   r   r   )r  r  rO   r  )
rb   r   r   r   rG  r  r\   r  rR   r   rU   rU   rU   rV   r    s   
 


r  c                      s  e Zd ZU ded< dddZdŇ fd	d
ZdddZedddZej	dddZdddZ
dddZdddZdddZdd d!Zdd"d#Zdd%d&Zdd(d)Zdd+d,Zdd-d.Zdd/d0Zdd1d2Zdd3d4Zdd7d8Zdd;d<Zdd@dAZddBdCZddEdFZddJdKZddLdMZddNdOZdddSdTZddUdVZ ddXdYZ!ddZd[Z"dd\d]Z#dd^d_Z$ddbdcZ%ddedfZ&ddgdhZ'ddmdnZ(ddodpZ)ddqdrZ*ddudvZ+dd{d|Z,dd~dZ-dddZ.dddZ/dddZ0dddZ1dddZ2dddZ3dddZ4dddZ5dddZ6dddZ7dddZ8dddZ9dddZ:dddZ;dddZ<dddZ=dddZ>dddZ?dddZ@dddZAdddZBdddZCdddĄZD  ZES )rF   zdict[Dep, int]_Scheduler__dep_size_hint_cacher  list[ir.Operation]rO   rp   c                 C  s8   t d | | W d    d S 1 sw   Y  d S )NScheduler.__init__)r   _initrS   r  rU   rU   rV   r     s   
"r  c                   s  t    i  _ tj_i  _tt _	t
  _tt   _tg tjj tjj tjj  _ fdd|D  _    jtjj   jD ]}|  qQ   _dd  jD  _dd  jD  _ j  _i  _i  _ t!" j j j _ #   $ j _ %  dd  jD  _ &  t' j(t) j7  _(ddl*m+}m,} | j t) j _- .   $ j _tt/ttf    _0t1j2d urt12 j _ 3 j _ 4   5  t1j6r j7d d	 t1j8rd
dl9m8} | j j jttjj ttj:  _t1j;r$t!< j _ =   >  | j tj?@ j  A  tt   _Bi  _CtDdE fdd d S )Nc                   s   g | ]}  |qS rU   )create_scheduler_noder+  r[   rU   rV   r         z#Scheduler._init.<locals>.<listcomp>c                 S  r   rU   r   r+  rU   rU   rV   r     r   z#Scheduler._init.<locals>.<dictcomp>c                 S  s$   i | ]}|  D ]}| |qqS rU   )r   rR   )r   rH   r   rU   rU   rV   r     s
    
c                 S  r   rU   r   r+  rU   rU   rV   r     r  r   )log_ir_post_fusionlog_ir_pre_fusion)num_ck_nodesr   )reorder_for_peak_memoryZgraph_statsc                     s    j  jt jdS )N)Zgraph_idZnum_nodes_before_fusionZnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrg   r  rU   r[   rU   rV   r   1  s   z!Scheduler._init.<locals>.<lambda>)Fr  r   r  rC   rw   rG   backendsr  _post_grad_graph_counterr  r   count_graph_partition_counterr   rP   rW  r  keys	constantsZtorchbind_constantsr  r  update_zero_dim_cpu_tensorr]  r   get_donated_buffersr|   ro  r}   copyr  r   mutation_renamesr   Zdecide_global_ordering_of_commscompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr   Zir_nodes_pre_fusionrg   Ztorch._inductor.debugr  r  r  create_foreach_nodesrI  logged_slow_fusionr   Z_pre_fusion_custom_passr  r	  finalize_multi_template_buffersZcombo_kernelscreate_combo_kernel_nodesr  memoryget_output_namesZ reorder_for_compute_comm_overlapZ$reorder_compute_and_comm_for_overlapprocess_grouped_nodescompute_last_usager  Zgraph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)rS   r  rH   r  r  r  r  r[   rV   r    s   












zScheduler._init!dict[str, SchedulerDonatedBuffer]c                 C  sD   i }t jjD ]}tt jj| tjrt| t jj| d d||< q|S )N)rJ   )rC   rw   Zgraph_inputs_originalru   r   ZDonatedBufferr   )rS   Zname_to_donated_bufrZ   rU   rU   rV   r  8  s   

zScheduler.get_donated_buffersr7  c                 C  s   t jjS rQ   rC   rw   current_devicer[   rU   rU   rV   r  C  s   zScheduler.current_devicer<  c                 C  s   |t j_d S rQ   r  r;  rU   rU   rV   r  G  s   c                 C  s4   t jdddkrddlm} || jdd dS dS )z,Generate an image of the graph for debuggingZINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)Zprint_graph)osenvironr   r  r  r  )rS   r  rU   rU   rV   r  K  s   zScheduler.debug_draw_graphlabelrP   c                 C  s4   t tjrt d| | jD ]}|  qd S d S )Nz%s:)r   isEnabledForloggingINFOr   r  r   )rS   r  rH   rU   rU   rV   debug_print_nodesR  s   

zScheduler.debug_print_nodesrH   r   rD   c                 C  s`   |  d us
J d| rt| |S t|tjtjfr!t| |S t|tjr,t	| |S t
|)Nz2All nodes passed to scheduling must have an origin)re  Zis_no_oprY  ru   r   r+   r  r*  r  r  rb  r   rU   rU   rV   r  X  s   


zScheduler.create_scheduler_nodec                   s   t t  g }j  tjj D ]9} fdd|D }|s!q| fdd|D }t	j
dk}t|d|d}|| |D ]}|j|< qCqfddjD t| _d S )Nc                   s(   g | ]}| v rt j| ts|qS rU   )ru   ro  rY  r   rZ   )kept_node_namesrS   rU   rV   r   k  s    z2Scheduler.create_foreach_nodes.<locals>.<listcomp>c                   s   g | ]} j | qS rU   ro  r  r[   rU   rV   r   v  r  r   Fr}  r  c                   s   g | ]
}|   vr|qS rU   r   r   )removed_node_namesrU   rV   r     s    )r   rP   r  r  rC   rw   listsr   r]  r   combo_kernels_autotuneri  rf  r  r   )rS   Zfe_nodesnamesr  r  Zfe_noderZ   rU   )r  r  rS   rV   r  e  s6   






zScheduler.create_foreach_nodesc                   s\  t d}G  fdddt|  t jD ]Q}| D ]J}| }| D ]?}|v rV|v rV| }| }|| } D ]}	|	 |u sP|	 |u rT||	< qBq(|v ra| |< q(| |< q(qqd%fdd				d&d'fdd}
i }t	j
j D ]\}}t|tjr|jD ]}d||< qqjD ]>}td|j |jdusJ t|j dd d}|D ]}t|tjsJ ||vr| ||< qt|j dd d}|D ]+}||v sJ | d| ||  }durj|  D ]}|t|  qqt|jjdkr'tt|jj }r't|t r'|j!}nd}| D ]d}t|" dks:J |" D ]Q}|}|
|| |t||d | jD ]6}| | krdqWt|jt#smJ |j$ D ]}|}|t%|| d |
||dd qrqWq>q-|jj&D ]}t|t%s|
|j'||(| q|)j* | D ]'}|" D ]}| j*|< | j*|< j+,||j+| < qqqt	j
- D ]}td| |
|t.t| qt	j
j/D ]=}| D ]5}||v sJ | d|  ||  }r3j| $ D ]}td || |
|t.t| qqqj*D ],}|t	j
jv rU|
|t.t| t	j
j01| q:|t	j
j2v re|
|t.t| q:d!d" t3t	j
j D fd#d$t	j
j0D t	j
_4jD ]}| D ]}|5|  j qqj6D ]}j6| 5| j qdS )(zi
        Create dependency edges between nodes, handling aliasing and
        mutation properly.
        Tc                      s8   e Zd ZdZ		ddd	d
ZdddZd fddZdS )z1Scheduler.compute_dependencies.<locals>.DedupListan  
            This data structure behaves like a list except it makes sure the
            elements remain unique.
            Normally one could use a OrderedSet/dict for this purpose however
            the list in question gets elements appended as it is being
            iterated over which means that we need to keep the list
            semantics.
            NitemsOptional[list[T]]
membershipOptional[OrderedSet[T]]rO   rp   c                 S  s   |pg | _ |p	t | _d S rQ   )r  r   r  )rS   r  r  rU   rU   rV   r     s   
z:Scheduler.compute_dependencies.<locals>.DedupList.__init__	node_userr  c                 S  s*   || j v rd S | j| | j | d S rQ   )r  r  rf  r  )rS   r  rU   rU   rV   rf    s   
z8Scheduler.compute_dependencies.<locals>.DedupList.appendr  DedupList[T]c                   s4   t  j|j} j fdd|jD  }||S )Nc                   s   g | ]	}| j vr|qS rU   )r  rR  r[   rU   rV   r         zMScheduler.compute_dependencies.<locals>.DedupList.__add__.<locals>.<listcomp>)r   r>  r  r  )rS   r  Znew_membershipZ	new_items	DedupListr[   rV   __add__  s
   
z9Scheduler.compute_dependencies.<locals>.DedupList.__add__r4  )r  r  r  r  rO   rp   )r  r  rO   rp   )r  r  rO   r  )rb   r   r   rg  r   rf  r  rU   r  rU   rV   r    s    
r  r,  rP   rO   c                   s   | j v r j |  S | S rQ   )r  r,  )r   rS   rU   rV   r     s   
z.Scheduler.compute_dependencies.<locals>.renameFused_by_namerP  r  rG  r   r  rp   c                   s    |   t||| d S rQ   )rf  r  )r  rP  rG  r  )name_to_usersr   rU   rV   add_user  s   
z0Scheduler.compute_dependencies.<locals>.add_userNzscheduling %sc                 S  r4  rQ   r   rK  rU   rU   rV   r         z0Scheduler.compute_dependencies.<locals>.<lambda>r  c                 S  r4  rQ   r   rK  rU   rU   rV   r     r  z not in r   )r/  )mutating_bufT)r  zscheduling output %sz+scheduling output %s for unbacked symint %sc                 S     i | ]\}}||qS rU   rU   )r   r,  rZ   rU   rU   rV   r   K	  r   z2Scheduler.compute_dependencies.<locals>.<dictcomp>c                   r  rU   rU   r  )	inp_namesrU   rV   r   N	  s    z2Scheduler.compute_dependencies.<locals>.<listcomp>)r,  rP   rO   rP   )FF)
r  rP   rP  r  rG  r   r  r   rO   rp   )7r
   r   r  r   r  r   rR   rd   r  rC   rw   r  r  ru   r#  r  free_symbolsr   r  rH   r  Zget_unbacked_symbol_defsSymbolZget_unbacked_symbol_usesro  r   r'   rg   r   r   r  r+  r&   r/  rf   rD   r'  r(   r   rZ   rG  r   r  r   r   r  r   Zgraph_outputsZmutated_inputsr  r  r  Zmutated_input_idxsr   r|   )rS   r  rH   Zbuf1Z	buf1_nameZ	buf2_nameZlist1Zlist2combinedr  r  Zunbacked_symbol_to_origin_noderZ   valfsZunbacked_symbol_defsr|  Zunbacked_symbol_usesru  r   r   Z	node_modeZalt_namerk   Z
other_namerZ  r5  r   rU   )r  r  r  r   rS   rV   r    s  





	




zScheduler.compute_dependenciesc                   s   g }t | jD ]uddd d} D ]$}t fdd	|jD }|r6td
|  tj	j
|  qd}q  o@| }|sI| qtd  tj	j  jjD ]}|j| jv r{| j|j j}fdd|D | j|j _q^qtt || _| jD ]  qdS )z0
        Remove any nodes without users
        rk   r  rO   r   c                 S  s   | j p
|  tjjv S rQ   )r  rR   rC   rw   r  )rk   rU   rU   rV   can_eliminate_userd	  r   z;Scheduler.dead_node_elimination.<locals>.can_eliminate_userFc                 3      | ]} |V  qd S rQ   rU   r   u)r  rU   rV   r   i	  r!  z2Scheduler.dead_node_elimination.<locals>.<genexpr>zremoved dead buffer: %sTzremoved dead operation: %sc                   s"   g | ]}|j    kr|qS rU   rn   r  r  rU   rV   r   {	  s    z3Scheduler.dead_node_elimination.<locals>.<listcomp>N)rk   r  rO   r   )r"  r  r   r.  rM   r   r  rR   rC   rw   rX  r  rH  rf  r  r   r   rZ   r}   r   r  )rS   Zupdated_nodesZactive_buffersr   Zcan_eliminaterZ  rM   rU   )r  rH   rV   r  Z	  s6   



zScheduler.dead_node_eliminationr  c                   s^   t t  t  g d fdd|D ]}| D ]}| |< qq|D ]}| q&S )	z?
        Ensure nodes is in topologically sorted order
        r,  rD   rO   rp   c                   sV   | vr) |  t| jdd dD ]}|j vrq |j  q|  d S d S )Nc                 S  r4  rQ   r   )drU   rU   rV   r   	  r  zDScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>r  )r  r  r   rZ   rf  )r,  r   ro  rj   seenvisitrU   rV   r  	  s   

z2Scheduler.topological_sort_schedule.<locals>.visitN)r,  rD   rO   rp   )r   rD   r  r'  )rS   r  rH   rZ   rU   r  rV   r  	  s   



z#Scheduler.topological_sort_schedulerJ  c                   sv   t t  }t|ttttfr|jD ]}||j	 qn
t
dt| d fdd|D }tt  fdd|D S )Nz+get_unmet_dep_nodes is not implemented for .c                 3  s    | ]
} j |  V  qd S rQ   )r}   rW   r   r[   rU   rV   r   	      z1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>c                 3  s    | ]} j | V  qd S rQ   r  r+  r[   rU   rV   r   	  rc  )r   rP   ru   r*  r  rY  r  r   r  rZ   RuntimeErrorra   r   )rS   rJ  Z
unmet_depsr   Zunmet_dep_opsrU   r[   rV   _get_unmet_dep_nodes	  s"   

	zScheduler._get_unmet_dep_nodesr  c                 C  s   g }t | jd}i }| jD ]!}| |}t|||< |D ]}||g }|| |||< qqdd | D }|rf|| |D ]}	||	g D ]
}
||
  d8  < qJ||	 qBdd | D }|s;|rlJ d|S )zU
        Sort nodes by their topological order, return a list of node lists.
        r   c                 S     g | ]
\}}|d kr|qS r   rU   r   r,  vrU   rU   rV   r   	  r@  z5Scheduler._topological_sort_nodes.<locals>.<listcomp>r   c                 S  r  r  rU   r   rU   rU   rV   r   	  r@  zTopological sort failed!)	r  fromkeysr  r  rg   r   rf  r  r  )rS   r  r  childrenrH   r
  r   cZzero_deg_nodesr,  rk   rU   rU   rV   r  	  s,   




z!Scheduler._topological_sort_nodesc                 C  s   i }| j D ])}tt  }|jD ]}| j|j  }|| ||| O }q||| < ||_	qt
| j D ]
\}}||_||_q4dS )z.
        Populate each node.ancestors
        N)r  r   rP   r   r}   rZ   rW   r  rR   r   r  r   r   )rS   Zname_to_ancestorsrH   r   r   Zdep_node_namer  rU   rU   rV   r  	  s   



zScheduler.compute_ancestorsc                 C  sd   | j D ],}tjs	qt|ttfr| stjdkrq| D ]}t|tr)|	 r*q|
  qqd S )NZhalide)r  r   r  ru   r*  r  r>   Zcpu_backendr"  rA  r	  )rS   rH   rJ  rU   rU   rV   r	  	  s   


zScheduler.merge_loopsc                 C  s   t dC tdD ]4}t|}td|d | | |}t|}td|d || ||ks3|dkr=td|d   nq	|W  d   S 1 sJw   Y  dS )zB
        Combine eligible nodes into FusedSchedulerNodes.
        zScheduler.fused_nodes
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  rg   r  r  fuse_nodes_once)rS   r  r  Zold_lenZnew_lenrU   rU   rV   r  	  s0   

$zScheduler.fuse_nodesc                 C  s8   g }| j D ]}|t|tr| n|g q|| _ dS )zA
        Unpack GroupedSchedulerNode into regular nodes.
        N)r  r	  ru   rB  r  )rS   Z	new_nodesrH   rU   rU   rV   r  
  s   

zScheduler.process_grouped_nodesr1  tuple[float, str]c                 C  sh   t |dksJ |d  }|| _| |}tdddd ||W  d   S 1 s-w   Y  dS )
        Benchmark fused list of nodes and return the execution time
        in milliseconds on randomly generated inputs.
        r   benchmark_fused_nodesTZcompile_time_autotune_time_us)Zlog_pt2_compile_eventZdynamo_compile_column_usN)rg   r8  r  r  r   r)  )rS   r  r<  backendrU   rU   rV   r)  !
  s   
$zScheduler.benchmark_fused_nodesbenchmark_kernelr   c                 C  sd   t |dksJ |d  }|| _| |}td |||W  d   S 1 s+w   Y  dS )r(  r   r)  N)rg   r8  r  r  r   generate_kernel_code_from_nodes)rS   r  r+  r<  r*  rU   rU   rV   r,  3
  s   


$z)Scheduler.generate_kernel_code_from_nodesmoduler   r_  c                 C  sF   || _ | |}td ||W  d   S 1 sw   Y  dS )r(  r)  N)r  r  r   benchmark_codegened_module)rS   r-  r<  r*  rU   rU   rV   r.  A
  s
   

$z$Scheduler.benchmark_codegened_modulec                 C  sH  ddd}t | jD ]\}}t|trt|jtjr|j}tjj	s(|
 \}}n
td	d
 |jD }t|tjjjrA|j| q
| }|j}t|tjsPJ |j}	t|	tjs[J |j|	_|||	 | |	}
|
| j|< |
| j| < |
| j| < t|
 | D ]\}}|| j| < |j|_q|j|
_|j|
_|j |
_ q
d S )N	orig_nodeir.MultiTemplateBufferr  ir.OperationBufferrO   rp   c                 S  s   |  }|   }t|trt|tsJ | }|  }t|tr&t|ts(J tjj|= ||_tjj|= ||_	tjj
| }tjj
| |tjj
|< |tjj|< tjj| }tjj| |tjj|< |tjj|< d S rQ   )rR   ru   rP   r  rC   rw   r  rZ   Z
name_to_opZoperation_namebuffersr,  remove
operations)r/  r  Zreplaced_buf_nameZorig_buf_nameZreplaced_op_nameZorig_op_nameorigrU   rU   rV   replace_operation_bufferN
  s$   

zKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_bufferc                 s  s$    | ]}t |tjjjr|V  qd S rQ   )ru   r   r   Zselect_algorithmZExternKernelCaller)r   ZtimingrU   rU   rV   r   r
  s    
z<Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>)r/  r0  r  r1  rO   rp   )!r  r  ru   r*  rH   r   MultiTemplateBufferr   Ztest_configsZ%force_extern_kernel_in_multi_templateget_min_choicer  choice_timingsr   r   TritonTemplateCallerBasefinalize_as_triton_callerZoutput_noder   
StorageBoxZOperationBufferrc   r  ro  rR   r  r  r   r}   rM   r   r   r   )rS   r6  r  rH   
multi_nodeZmin_node_unfusedrN  Zout_tensorboxZout_storageZ
out_bufferZnew_scheduler_nodeZnew_outZold_outrU   rU   rV   r  M
  sP   




z)Scheduler.finalize_multi_template_buffers	node_listc                 C  s   t dd |D S )Nc                 s  sB    | ]}t |jd o|jduot |jjdo|jjjdkV  qdS )r   Nscatter_moder0  )ry   rH   r   r?  r+  rU   rU   rV   r   
  s    
z,Scheduler._any_atomic_add.<locals>.<genexpr>)r   rS   r>  rU   rU   rV   _any_atomic_add
  s   zScheduler._any_atomic_addr  r  Union[bool, Callable[[], bool]]c              	     s  t dd fD }tjs|sdS  rt tjr& s& r(dS 	 }|d 
 s6J jdkr=dS 	 }tt||}|rPdS ddlm  t|d 
 dusgJ d fddtjj d!fdd}|r8t dd fD r8 dur n ttjsJ j} \}		 \}		r|n|\
}
g d}t| dd dD ]O\}}t|tjjjsqψst|dr|jjkrq|	
 kr n-|d7 }|tj kr n"!| "|g||R  W d   n	1 sw   Y  qt#dkr(dS d"	
fdd}|S ||||||d" fdd}|S )#
        If config.benchmark_fusion is False, always return True.
        Otherwise, return True if fusion can brings speedup.
        c                 s  s(    | ]}|  ot| tjV  qd S rQ   )rA  ru   r  r   r7  r+  rU   rU   rV   r   
  s    
z.Scheduler.speedup_by_fusion.<locals>.<genexpr>Tr   r:  CompilationErrorNms_fusedr  ms1ms2rO   rp   c              	     st   t tjr8| || k r"t d   t|| |  d d S t d   t| ||  d d S d S )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  r'  r3   r4   )rF  rG  rH  r  rU   rV   
log_fusion
  s   z/Scheduler.speedup_by_fusion.<locals>.log_fusionr  r1  )tuple[Optional[LambdaFuture], ModuleType]c                   sP   j | dd}t|}  sd }||fS  jd|d}t|ts$J ||fS )NT)r+  Ztriton_)Zkernel_nameZsource_code)r,  r   loadZuse_process_pooltritonru   r   )r  Zsrc_codemodfut)async_compilerS   rU   rV   compile_kernel
  s   
z3Scheduler.speedup_by_fusion.<locals>.compile_kernelc                 s  s    | ]	}|  d uV  qd S rQ   r  r+  rU   rU   rV   r   
  s    
c                 S     | d S rn  rU   rK  rU   rU   rV   r         z-Scheduler.speedup_by_fusion.<locals>.<lambda>r  allowed_prologue_inpsr   Fr   c            	        s
  t d} d }i }D ]^\}}}z
|d ur|  W n% ty> } zttjr4tds.dndt| W Y d }~q
d }~ww 	| 
| \}}|||< || k rY|} |}W d    n1 scw   Y  q
|  |  k r|d ur| |_dS dS )NinfzException in compiling %s: %sr  r  TF)r  rj   r   r  r  r  rJ  r  rP   swap_as_triton_callerr.  r;  Z_choice_timings)	Zmin_ms_fusedZms_fused_choiceZnew_timingschoicefutureZ	mod_fusedr  rF  path)r<  epilogue_fusionfuture_choicesrK  rG  rH  r=  rS   rU   rV   benchmark_when_ready1  sF   
	
z9Scheduler.speedup_by_fusion.<locals>.benchmark_when_readyc               
     sp  ddl m}  zd 
d 	d fD ]
}|d ur|  qd \ t r3d W dS 
d \trId W dS 	d \tr_d W dS   tdr  krfjvrjf t	d
 fd	d
   k W S  | y   Y dS  y } zdt|v rW Y d }~dS  d }~ww )Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelZslow_fusionc                	     s       dS )N)Zkernel1_pathZkernel1_latencyZkernel2_pathZkernel2_latencyZfused_kernel_pathZfused_kernel_latencyZslow_down_ratiorU   rU   rG  rH  rF  Zpath1path2Z
path_fusedrU   rV   r     s   
zKScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>Loop-carried variableT)Z)torch._inductor.runtime.triton_heuristicsr^  rj   r.  mathisinfr   r  r  r   r  rP   )r^  rP  r  )rE  r<  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2rK  rS   rz  r_  rV   r]  `  sZ   


)rF  r  rG  r  rH  r  rO   rp   )r  r1  rO   rL  r   )$r   r   Zbenchmark_fusionrA  ru   r  r   TritonTemplateBufferrC  r"  r8  ra   r   r   r  rA  triton.compiler.errorsrE  r  r   r   rQ  ZAsyncCompiler7  r9  r8  r)  r  r  r:  ry   rU  Z max_epilogue_benchmarked_choicesrW  rf  rg   )rS   r  r  Zis_multi_templateZnode_list_1Znode_list_2Znode_list_fusedrR  r9  rN  r`  Ztriton_choicesrX  Zunfused_timer]  rU   )rE  rQ  r<  r[  rd  re  rf  r\  rK  rG  rH  r=  r  r  rS   rz  rV   speedup_by_fusion
  s   






'BzScheduler.speedup_by_fusionc                 C  s   | j |  S )z0Look up the node in Scheduler name_to_fused_node)r  r  r   rU   rU   rV   rN    s   zScheduler.get_fused_nodec                   s  t |ttjrtd D ]}td|   qi dfdd d fd
d}|D ]@\}}||| |}|}	||ru
||su||}t|rm|||f|< |||f|< q5|spq5 || q5t  } D ]/\}}	}
||v rq}|| |	|	u sJ |
|
u sJ | r
|	|
s |	|
 q}tdd d}|}| |S )a  
        Combine eligible nodes into FusedSchedulerNodes.

        This relies on two key functions to control the logic:
            - self.can_fuse(): checks if a fusion is legal
            - self.score_fusion(): assigns priority to a given fusion
        zfuse_nodes_once, candidates:z  r  rD   r  rO   c                   s   t d|  |  |  }| |ksJ || | |  |   j	 fdd 
 D   S )Nzfusing %s with %sc                      i | ]}|   qS rU   r   r+  Znode3rU   rV   r     r  zEScheduler.fuse_nodes_once.<locals>.fuse_two_nodes.<locals>.<dictcomp>)r  r  rR   r8  r  rH  r3  r  r  r]  r"  )r  r  r<  )r  rS   rk  rV   fuse_two_nodes  s   


z1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodesrp   c                   s    | v s |v rf |  |d }|d us$J |\}}}|d  |d   ||u s>J  ||u sGJ | rP| |rQq  ||  | v s |v sd S d S rQ   )rN  r   r  will_fusion_create_cycle)r  r  Zpending_fusionZ
is_speedup	node_key1	node_key2)rl  pending_fusionsrS   rU   rV   resolve_pending_fusions  s"   

z:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusionsc                 S  r4  rQ   rA  rK  rU   rU   rV   r     r  z+Scheduler.fuse_nodes_once.<locals>.<lambda>r  N)r  rD   r  rD   rO   rD   r  )r   r  r  r  rJ  r  r   get_possible_fusionsrN  rr  rm  ri  callabler   r  r  r  r  )rS   r  rH   rq  r  r  ZspeedupZseen_pair_speedup_fnZis_speedup_fnrn  ro  rU   )rl  r  rp  rS   rV   r&    sR   










zScheduler.fuse_nodes_onceNr  Optional[int]c           	        s<  t | j}d}t| j}td| tt| D ]a\}}t|}t|dk r)q|dur3||kr3 nH| 	|s?td| q|d7 }t
jdk}t|d j|d|d td	t|| |D ]}|| q^|  | j fd
d  D  qt|dd d| _| | j| _td||t| j | | j dS )z'
        Groups parallel nodes
        r   z2ComboKernels: Generating with num_ck_nodes = %d...r2  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                   rj  rU   r   r+  r;  rU   rV   r   6  r  z7Scheduler.create_combo_kernel_nodes.<locals>.<dictcomp>c                 S  r4  rQ   rA  rK  rU   rU   rV   r   8  r  z5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>r  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)r   r  rg   r   r  r  ri  r  r  speedup_by_combo_kernelr   r  rG   r   r3  r  r  r]  r"  r  r  r  )	rS   r  r  r  Znum_nodes_orignumr>  r  rH   rU   r;  rV   r    sV   





z#Scheduler.create_combo_kernel_nodesc                 C  s   |D ]}| | j qd S rQ   )r  r  )rS   r  rH   rU   rU   rV   r  B  s   zScheduler.prune_redundant_deps1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]c           	        s   g  t tttf   d fdd}tt}|D ]}|r#q| D ]	}|| | q'q|	 D ]}|| q6t
jrdtt}|D ]}t|dd}|rX|| | qG|	 D ]}|| q]   jjd	d
 tdt   S )z^
        Helper to find all legal fusion opportunities, sorted by self.score_fusion()
        r  r  rO   rp   c                   s   t | D ]>\}}| |d d  D ]1}||f}|v rq| ||r, | q| s4| rA||rA ||f qqd S rn  )r  r  rr  rf  rA  rC  )r  Znode1_indexr  r  r  possible_fusionsr  rS   rU   rV   check_all_pairsO  s   
z7Scheduler.get_possible_fusions.<locals>.check_all_pairsr   NT)r  reversezfound %d possible fusionsr  r  rO   rp   )r   rI  rD   r  r   r   unfusable_noder  rf  r   r   Zaggressive_fusionr   *get_possible_fusions_with_highest_priorityr  score_fusion_keyr  r  rg   )	rS   r  rz  Zbuffer_names_groupingrH   r   Znode_groupingZgroup_groupingr   rU   rx  rV   rr  F  s6   




zScheduler.get_possible_fusionsc                   s   t t  d fdd| j | j B |jj |jj B   tfdd D }|rAt||d	 |S )z~
        Finds whether there's a path from node1 to node2 (or vice-versa)
        caused indirectly by other fusions.
        rH   rD   rO   r   c                   s^   t | tr-| vr-|  |   rdS t| j@ p,tfdd| j  D S dS )NFc                 3      | ]
} j | V  qd S rQ   r  r+  
found_pathrS   rU   rV   r     s
    
zIScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>)ru   r  r  r#  issubsetr   r   r   r  Zcombined_ancestorsZcombined_namesr  rS   visitedrU   rV   r    s   

z6Scheduler.will_fusion_create_cycle.<locals>.found_pathc                 3  r  rQ   r  r+  r  rU   rV   r     r  z5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>zwill create cycleNrH   rD   rO   r   )r   r  r#  _dictr  r   r   r  )rS   r  r  cyclerU   r  rV   rm  x  s   
z"Scheduler.will_fusion_create_cyclec              	     s   ddl m  dfdd}||}||}t fd	d
|D }t fdd
|D }||}d}	|D ]}
z
|	t|
d 7 }	W q4 tyK   Y  dS w ||}tjj	
|	d| r^dS dS )a  
        Return true if fusing the two nodes can potentially increasing peak memory.

        The implementation is more like a heuristic since we don't really know if we are at peak
        or not when trying to fuse these two ndoes. The order of nodes may change later which makes the
        peak memory estimation hard.

        Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
        1. find all buffers read by each node with a single user. These buffers are supposed to
           be reused if we don't fuses these 2 nodes
        2. find the intersection of these buffers for the two node and sum the total buffer size.
           If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
           Note that the extra memory allocation is not necessarily causing peak memory increase.
           This is just a heuristic.

        We return true only if the saving for fusion can not trade off the extra memory allocation.
        r   buffer_reuse_keyrH   rD   rO   list[ir.Buffer]c                   sL   g }| j jD ]} j|j}|r#t|jdkr#|j r#|	|j q|S rn  )
r   r   r}   r   rZ   rg   rM   rH   Zhas_tensor_outputrf  )rH   r   rp  r   r[   rU   rV   _find_single_user_inputs  s   zKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputsc                 3  r  rQ   rU   r   r  rU   rV   r     r!  z<Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>c                 3  r  rQ   rU   r   r  rU   rV   r     r!  r   r2  F    TN)rH   rD   rO   r  )rU  r  r   intersectionrX   r  score_fusion_memoryrC   rw   r  Zstatically_known_gt)rS   r  r  r  Zlhs_dep_nodesZrhs_dep_nodesZlhs_reuse_keysZrhs_reuse_keysZcommon_reuse_keysZmemory_overheadr  Z	bw_savingrU   )r  rS   rV   can_fusion_increase_peak_memory  s$   
z)Scheduler.can_fusion_increase_peak_memoryc                 C  s*   t t|j|j t|j|j }|dkS )aB  
        This function prevents fusion for nodes that can increase memory
        footprint. This problem is more common in horizontal fusion, where nodes
        that are far apart in the original order get fused, lengthening the live
        intervals of tensors. This is very evident in models with activation
        checkpointing, where the recomputed nodes from different checkpointed
        regions get fused and significantly increase the memory footprint.

        The current attempt is a quick, possibly hacky, heuristic to prevent the
        fusion of nodes that are far away in the original order.

        A better but difficult to implement heurisitic would be to use live
        intervals of the buffers, find region of peak pressure in the original
        program and prevent fusion that crosses that peak region. We might need
        special care or good approximation in this implementation, as fusion of
        node changes live intervals, and re-computing live intervals and peak
        memory after each fusion can introduce large compilation overhead.
        @   )r  r  r   r   )rS   r  r  Zproximity_scorerU   rU   rV   are_long_distant_nodes  s
   z Scheduler.are_long_distant_nodescommon_buf_names"Union[tuple[str], OrderedSet[str]]c                 C  sb  i }dd |j  D }dd |j  D }|D ]}tj|}|| }	|| }
t|	tr2t|
tsAdt|	 dt|
 ||< q|	 |
 krXd|	  d|
  ||< qt	|	j
t	|
j
krgd||< q|	 }|
 }||kr~d| d| ||< q|	 |
 krd	|	 d|
 ||< qd
}t|tjsd|j }d|	 d|
 d| ||< qt|S )z}
        Try to decide reasons why fusion fail due to no shared memory even though
        there are common buffers.
        c                 S     i | ]}|j |qS rU   r   r   rU   rU   rV   r     rP  z7Scheduler.decide_fusion_fail_reason.<locals>.<dictcomp>c                 S  r  rU   r   r   rU   rU   rV   r     rP  znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   rO  rC   rw   r  ru   r&   ra   r  rB   r-  Z
get_offsetnormalize_with_stride_orderr   r  rc   rP   )rS   r  r  r  reasonsnode1_name2depnode2_name2depr5  r   lhs_deprhs_depZlhs_offZrhs_offZ
layout_strrU   rU   rV   decide_fusion_fail_reason  sD   
z#Scheduler.decide_fusion_fail_reasonrX   c                 C  sr  t jrtdd ||fD rdS |j }|j }||@ }|s"dS dd |j D }dd |j D }g }|D ]#}	||	 }
||	 }|
 | kr]|tj	j
j|
 dd|
|f q:t|dkrfdS t|dd	 d
\}}
}t|
tr{t|ts}dS |
j|jkr|
 | kr| |
S dS | s||
| n| s|||
 ntd| |  | ||S )z
        Right now just greedily reorder the loop of node1 to be compatible with node2,
        but ideally we should have some heuristics to reorder the loop for node2
        to be compatibile with node1 if that's more efficient.
        c                 s  r  rQ   )r=  r+  rU   rU   rV   r   C  s    
z>Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>r   c                 S  r  rU   r   r   rU   rU   rV   r   O  rP  z?Scheduler.shared_data_after_reordering_loop.<locals>.<dictcomp>c                 S  r  rU   r   r   rU   rU   rV   r   P  rP  r~  c                 S  rS  rS  rU   rK  rU   rU   rV   r   g  rT  z=Scheduler.shared_data_after_reordering_loop.<locals>.<lambda>r  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r  r   r   Zbuffer_namesrO  r  rf  rC   rw   r  r  r  rg   r  ru   r&   r
  r  dep_size_hintr?  r   r  r  rR   r  )rS   r  r  Znode1_buffer_namesZnode2_buffer_namesZcommon_buffer_namesr  r  
candidatesZbuffer_namer  r  Z_numelrU   rU   rV   !shared_data_after_reordering_loop8  sX   


z+Scheduler.shared_data_after_reordering_loopc                 C  s$   t |ttfo|  ot|j S )z>
        Is this node unfusable under any conditions.
        )ru   r  rY  rA  r@   rH   r   rU   rU   rV   r}    s
   
zScheduler.unfusable_nodeprologue_noder  rz  r  c           	      C  s   |  tjjkr
dS | }| }d}||| kr |d dS tdd | D }|tj	j
jjfkr:|d dS ddd}|| jrP| sP|d dS dS )zT
        Heuristics to avoid benchmarking predictably slow prologue fusions
        Tg?z@prologue fusion will not increase amount of bytes read in kernelFc                 s  s:    | ]}|j d ur|j  D ]}|jdkr|jV  qqd S )NZcall_function)rH   re  rT   rg  r   r,  r  rU   rU   rV   r     s    

zEScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>z\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsr  torch.dtyperO   r   c                 S  s   | j dko| jS )Nr2  )itemsizeZis_floating_point)r  rU   rU   rV   low_prec_fp  r  zGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fpzVprologue fusion that must be upcast to fp32 not profitable for low precision templatesN)r  r  rO   r   )r#  rC   rw   Zinvoke_quant_opsrv  rw  rI  r"  r   opsatenZconstant_pad_nddefaultr  r  r/  )	rS   r  r  rz  
read_byteswrite_bytesZBYTES_THRESHOLD_MULTIPLIERrl  r  rU   rU   rV   (check_prologue_fusion_heuristics_fusable  s4   

z2Scheduler.check_prologue_fusion_heuristics_fusablec                   s  ||u rdS t ||}| r| | ||rdS t|ts&t|tr,|d dS t|ttfr=| s=|d dS t|ttfrN| sN|d dS |	 |j
@ r[|d dS | r,tjsi|d dS | sq| rw|d dS | }t|tjs|d	 dS | }td
d |jD | }| |@ r|d dS | s| r|d dS |   dd D ]}| }|D ]}	t fdd|	jD s|d   dS qqt|ts|gndd |jD }
t|
dksJ |
d }t d jdkrt d jd jdkr d jd jd j|u s"|d dS | |||s,dS | rE| s?| s?tj sE|d dS | t!j"j#@ sW| t!j"j#@ r]|d dS | }| }||krr|d|| dS ~| $||}|tj%k rtj&r| '||}t()t*j+rt(,d|- |- | t!j./| |||sdS |	 |j
@ r| 0||ot!j.0| |||o| |0||S t!j.1| |||o| |1||S )zj
        Determine if it is possible to combine node1 and node2 into a
        single fused node.
        FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc                 s  r  rQ   r   )r   inprU   rU   rV   r     r!  z%Scheduler.can_fuse.<locals>.<genexpr>z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNra  c                 3  s    | ]}|j  v V  qd S rQ   r  r  prologue_nodesrU   rV   r     rc  z7template prologue can only fuse nodes with a single usec                 S  rD  rU   r  r+  rU   rU   rV   r     rE  z&Scheduler.can_fuse.<locals>.<listcomp>r   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)2r  rA  r  r8  can_fuse_multi_outputs_templateru   rB  r  rY  r#  r   r   Zprologue_fusionr?  r  r   rg  Zget_allowed_prologue_inpsr   r  r'  r   r"  r   r.  rM   r  r  rg   r   rH   r  r[  rC   rw   Zno_fuse_buffer_namesr  Zscore_fusion_memory_thresholdr  r  r  r  r  rJ  r  rR   choicesrr  can_fuse_verticalcan_fuse_horizontal)rS   r  r  rz  r  rU  Zunsupported_prologue_argsrH   Z	node_outsr   Ztemplate_snodesZtemplate_snoder<  Zdevice2Zshared_data_scorerU   r  rV   rr    s   



zScheduler.can_fusec                 C  s*  |  }t||}tt}|jD ]}| j|j|j}t|t	r(| 
|||r(q|| | q|jjD ]&}t|ts<q4|| j|j|j}	|	rZ|	D ]}
| |
|rY|	|
 qLq4tdd tj| D }||@ rt|d dS | }|D ]}| j|  }|| j| j@ r|d  dS qzdS )a  
        Check if it is legal to fuse a consumer (node2) into a producer (node1).

        We can fuse them if all the reads of node2 either match
        corresponding writes in node1, or are written by nodes that can
        be scheduled before the fusion of node1 and node2.
        c                 s  r   rQ   r   r   rU   rU   rV   r   z  r   z.Scheduler.can_fuse_vertical.<locals>.<genexpr>zmemory deps did not matchFz(intermediate nodes between node1 & node2T)r'  r  r   r   r   r  r   rZ   ru   r(   fusable_weak_deprf  r   r   r&   fusable_read_and_writer3  r   r   r  r  r   r#  r}   rW   r  r   )rS   r  r  Znode1_buf_namesrz  Zremaining_deps_by_namer   rZ   cd	remainingrp  Zremaining_depsZnode1_op_namesr  rU   rU   rV   r  [  sB   




zScheduler.can_fuse_verticalweak_depr(   c                   s   j | vr	dS fdd|jjD }t|dkrdS |d tts'J tjt	j
r0dS | jj   fdd|jjD }tfdd|D S )	NFc                   s   g | ]
}|j  jkr|qS rU   )rZ   r	  )r   rG  )r  rU   rV   r     s
    z.Scheduler.fusable_weak_dep.<locals>.<listcomp>r   r   c                   s   g | ]	}|j  kr|qS rU   r   r   rZ  )	real_namerU   rV   r     r   c                 3  sB    | ]}t |tot|jtj o|j jko|j jkV  qd S rQ   )ru   r&   r   r,  r   TMPr-  r  )rG  rU   rV   r     s    



z-Scheduler.fusable_weak_dep.<locals>.<genexpr>)rZ   r'  r   r   rg   ru   r&   r   r,  r   r  r   r	  r   r.  )rS   r  r  r  Zmutating_writesZrelevant_readsrU   )r  r  rG  rV   r    s$   

zScheduler.fusable_weak_deprZ  r%   rG  r&   c                 C  s   t |trQ| j|j|j}||jks!t|jtjs!t|jtjr#dS t	j
r4|j|jkr4| }| }|j|jkoPt|jt|jkoP|jd t|j |jkS t |try| j|j|j}| j|j|j}|j|jkry|jd ury||krydS dS r   )ru   r&   r  r   rZ   r   r,  r   r  r   r  r
  r  rg   r-  r'   r/  )rS   rZ  rG  Z	read_nameZ
write_namerU   rU   rV   r    s0   



z Scheduler.fusable_read_and_writer   c                 C  sR   d}|| j vr"z
| s| }W n	 ty   Y nw || j |< |S | j | }|S rS  )r  Zhas_unbacked_symbolsZnumbytes_hintKeyError)rS   r   resrU   rU   rV   r    s   


zScheduler.dep_size_hintc                   s   t |jjt |jj }t |jjt  jj }t||d t||k rH||kr.|} }|  fdd|jj|jjB D }tfdd|D S |jj|jjB  jj jjB @ }tfdd|D S )zn
        The first term in our fusion score that estimates number of saved
        memory operations.
        r  c                   s(   g | ]}| j jv s| j jv r|qS rU   )r   r   r   r   )r  rU   rV   r     s
    z1Scheduler.score_fusion_memory.<locals>.<listcomp>c                 3      | ]}  |V  qd S rQ   r  r   r[   rU   rV   r     rc  z0Scheduler.score_fusion_memory.<locals>.<genexpr>c                 3  r  rQ   r  r   r[   rU   rV   r     rc  )rg   r   r   r   r  r  ry  )rS   r  r  Znode1_dep_lenZnode2_dep_lentmpr
  Zcommon_memory_depsrU   )r  rS   rV   r    s   
zScheduler.score_fusion_memoryry  c                 C  s   t |dkr|S i }|D ]2\}}| | ksJ | }t| |||}||vr5||fg||< q|| ||f qt| t	ddd }t |dksTJ |S )Nr   r  r   )
rg   r8  rX   r  get_fusion_pair_priorityrf  r  r  operator
itemgetter)rS   ry  Z"possible_fusions_group_by_priorityr  r  r<  Zfusion_pair_priorityZ&possible_fusions_with_highest_priorityrU   rU   rV   r~    s.   
z4Scheduler.get_possible_fusions_with_highest_priority+tuple[BaseSchedulerNode, BaseSchedulerNode]r   c                 C  s   t jj| g|R  S )z-
        Shim for list.sort(key=...)
        )rC   r  Zscore_fusionr  rU   rU   rV   r  %  s   zScheduler.score_fusion_keyc                 C  s<   t tj }t| jD ]}||| j ||j	 qdS )zg
        Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
        N)
r   rC   rw   r  r"  r  r   r   r]  r   )rS   r   rH   rU   rU   rV   r  -  s
   zScheduler.compute_last_usagec                 C  s   t | jtjj tjjj D ]J}|| jv r'| j| }| r&tjj	|j
 q|tjjv rXtjj| }t|tjrAtjj	| q|j}t|tjrN| sPJ tjj	|j q| j  dS )z*Free any buffers that are no longer neededN)r  r  rC   rw   rX  rx   Zfreedr}   r   Zcodegen_freerH   r  ru   r   r  r   r<  Zis_input_bufferclear)rS   rZ   r   r  ZstoragerU   rU   rV   free_buffers8  s0   


zScheduler.free_buffersc                 C  s$   | j  D ]}|  q|   d S rQ   )r  r   flushr  )rS   r*  rU   rU   rV   r  P  s   
zScheduler.flushscheduler_noder  c                 C  s   t |tsJ td d  d7  < ttdd |  |  W d    n1 s,w   Y  |j}t |t	j
sCJ dt||tjj |   d S )NZinductorZextern_callsr   F)Zincrease_kernel_countztype(node)=)ru   r  r   rC   Zset_kernel_handlerr#   r[  r   rH   r   r  ra   rV  rw   rx   r  )rS   r  rH   rU   rU   rV   codegen_extern_callU  s   
zScheduler.codegen_extern_callBaseSchedulingc                 C  s   t |jr|jd usJ | dtj| t|j}|d u r(td|j t sR|jdkrBt	j
| }jdk rBt|t t |jrR|jdksRtt || S )Nz( should have been normalized in loweringzUnsupported device type: cuda   Zmps)r>   ra   r,  rC   rw   Zadd_device_infor"   r  r   r   r  Zget_device_propertiesmajorr)   inspectcurrentframer*   )rS   r<  Zdevice_schedulingZdevice_propsrU   rU   rV   create_backendd  s   

zScheduler.create_backendc                 C  s0   |d usJ || j vr| || j |< | j | S rQ   )r  r  r;  rU   rU   rV   r  y  s   

zScheduler.get_backendc                   s`   dfdd  fdd|  D }t| }|r.t|td	d
\}}tjj	| d S d S )Nr,  torch.fx.NoderO   rX   c                   s2   |  j vr j dd t| jjD   j |  S )Nc                 S  r
  rU   rU   r  rU   rU   rV   r     r  z>Scheduler.enter_context.<locals>.get_order.<locals>.<dictcomp>)r  r]  r  rw   r  r  r[   rU   rV   	get_order  s   

z*Scheduler.enter_context.<locals>.get_orderc                   s4   i | ]}|j d ur|j  D ]	} ||fd qqS rQ   r  r  )r  rU   rV   r     s    
z+Scheduler.enter_context.<locals>.<dictcomp>r   r  )r,  r  rO   rX   )
r"  r   r  r  r  r  rC   rw   rx   enter_context)rS   rH   rl  rN  lastrU   )r  rS   rV   r    s   
zScheduler.enter_contextrZ   fused_node_namesr   c                   sP   z| j | j}W n
 ty   Y dS w t fdd|D o'|| jvo'|| jvS )NFc                 3  s"    | ]}|j p|  v V  qd S rQ   )r  rR   r  r  rU   rV   r     s     zAScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>)r}   rM   r  r.  r  r   )rS   rZ   r  rM   rU   r  rV   $can_buffer_be_removed_through_fusion  s   z.Scheduler.can_buffer_be_removed_through_fusionc                 C  s|   |  sdS |jdu rdS t|jtjrdS t|jtjrdS t|jddr(dS t|jdr<tdd |jj	j
D r<dS dS )zBReturn True if we should partition the inductor graph on this nodeTNZunbacked_bindingsrc   c                 s  s"    | ]}t |tjo|jV  qd S rQ   )ru   r#  r  r  )r   exprrU   rU   rV   r     r0  z-Scheduler.should_partition.<locals>.<genexpr>F)r>   rH   ru   r   Z
DeviceCopyZConditionalr   ry   r   rc   r-  r   rU   rU   rV   should_partition  s   
zScheduler.should_partition;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]]c                 C  s@   i }| tjj | jD ]}|j D ]	\}}|j||< qq|S )z~
        Return a mapping from name strings to the corresponding graph inputs or
        base scheduler node outputs.
        )r]  rC   rw   r  r  r   r  rH   )rS   ro  rH   rZ   Zscheduler_bufferrU   rU   rV   get_name_to_nodes  s   
zScheduler.get_name_to_nodes
partitionslist[PartitionType]skip_cudagraphs
list[bool]list[GraphPartitionSignature]c                   s  g }t tj }|  tt|t|D ]n\}}t  }|D ]
}||j	  q|
|}	tjdd |D }
t dd |
j|
jB D | }t   |D ]} |j qNfdd|D } fdd|D }fdd|	D }|t|||| |||	 }q|ddd	 S )
z
        Gets signature for each graph partition, including input nodes, output nodes, and
        whether deallocating an input within graph partition.
        c                 S  r  rU   r7  r   rU   rU   rV   r     r8  z;Scheduler.get_graph_partition_signature.<locals>.<listcomp>c                 S  r  rU   r   rR  rU   rU   rV   r     r8  c                   s   i | ]}| v r| | qS rU   rU   r  r  rU   rV   r     s
    z;Scheduler.get_graph_partition_signature.<locals>.<dictcomp>c                   s&   i | ]}|v r|| v rd ndqS )TFrU   r  r  ro  rU   rV   r     s
    c                   r  rU   rU   r  r  rU   rV   r     rP  Nra  )r   rC   rw   r  r  r  r"  r]  r   r  r  r   r<  r=  r   r   r   rf  r-   r>  )rS   r  r  
signaturesZunmet_output_names	partitionskip_cudagraphZoutput_namesrH   Zreturned_output_namesr   Zpartition_input_namesZinput_nodesZinput_deallocationoutput_nodesrU   r  rV   get_graph_partition_signature  sN   

z'Scheduler.get_graph_partition_signature9tuple[list[PartitionType], list[GraphPartitionSignature]]c                 C  s   g }d}g }g }| j D ] }| |}|r$||kr$|| || g }|}|| q|r8|| || || j||dfS )z
        Given a list of BaseSchedulerNodes, split into a list of
        graph partitions and compute partition input/output signatures.
        T)r  r  )r  r  rf  r  )rS   r  r  Zcur_partitionr  rH   r  rU   rU   rV   graph_partition  s$   





zScheduler.graph_partitionc                 C  sL   t d tjjjr|  n| | j	 W  d    S 1 sw   Y  d S )NScheduler.codegen)r   r   r   r   r  _codegen_partitions_codegenr  r[   rU   rU   rV   rV  "  s   


$r  r  PartitionType	signaturer-   c                 C  s   t jj}t| j}t j % t jjdd| ||d | | t jjt jj	\}}W d   n1 s6w   Y  t jj
|j t jj|| t jjjdd |jD  dS )z,Codegen a partition given its inputs/outputsTZ
partition_)Zis_subgraphZsubgraph_nameparent_wrapper_codeZpartition_signaturesNc                 S  rO  rU   r   r   rU   rU   rV   r   A  rP  z8Scheduler._codegen_partition_wrapper.<locals>.<listcomp>)rC   rw   rx   r  r  Zset_current_wrapper_codeZinit_wrapper_coder  generateZis_inferenceZdefine_subgraph_launcher_fnvalueZcodegen_partition_callZ	allocatedr]  r  )rS   r  r  r  Zgraph_partition_idZpartition_coderN  rU   rU   rV   _codegen_partition_wrapper*  s"   



z$Scheduler._codegen_partition_wrapperc                 C  sx   |   \}}t||D ]"\}}t|dksJ dt| |jr'| | q| || qt| j}tj	j
| dS )z
        Split nodes into partitions and codegen each partition into separate functions.
        This allows further applying different optimizations (e.g., cudagraph) to
        each function.
        r   z5Each partition must have at least one node but found N)r  r  rg   r  r  r  r  r  rC   rw   rx   Zset_all_partition_names)rS   r  r  r  r  Znum_partitionsrU   rU   rV   r  D  s   
zScheduler._codegen_partitionsc              	   C  s  t jr@dd l}t }t }t|D ]-}|jdkr#|j|j	j
jkr# n|j|jf}||vs:J d|j d|j d|| qd | _|D ],}ttjrmztd| |  W n tyl   td|  Y nw | | |  }r|| jks| s| r|   || jkr| jrt| jjrtjj !  || _t|jr|j"d usJ dtjj #|j" | j$%|j& | r|'t(|) \}	}
}| *|+|
||	 nc| rt,-t.|}| /| nS|0 rt,-t1|}| *|}d	d
l2m3} d	dl4m5} t6|||fr|}n	t7dt| |8| nt6|t9t:fr/| *|;| nt6|t<s7J |=  t j>j?rG| *|@  | jA%|B  | jC%|D  t6|t<sr| }|d urr| *|E rr|   qE| jrt| jjrtjj !  |   d S )Nr   Z_compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=)Fr   Z"check_stack_no_cycles_TESTING_ONLYZtorch._dynamo.convert_frame	tracebackextract_stackr   r"  rZ   filenameZ_dynamoZconvert_frame__file__linenor  r  r   r  r  rJ  r  rR   r  r   r  r8  rB  rA  r  r8   ra   rC   rw   rx   Zcodegen_device_guard_exitr,  Zcodegen_device_guard_enterr  r]  r   r  r   r"  r  codegen_templaterw  rx  r  r  rC  ri  Z codegen.cuda_combined_schedulingr  r  r  ru   ry  Zcodegen_combo_kernelr  r*  codegen_noderY  r   rN  Zdebug_sync_kernelcodegen_syncr  r'  rW  r#  ready_to_flush)rS   r  r   stackr  framer  rH   r<  r  r  r  Zbackend_r  r  r*  rU   rU   rV   r  Y  s   












zScheduler._codegen(tuple[float, float, list[Optional[str]]]c                 C  s:   |d   }| tj_|| _|dusJ | |}||S )r(  r   N)r8  rC   rw   rG   r  r  benchmark_combo_kernel)rS   r>  r<  r*  rU   rU   rV   r    s   

z Scheduler.benchmark_combo_kernelc                 C  s  t jsdS |}|d  }|du s|jdkrdS ddlm} dg }}t|D ]T\}}| }	| |	r9t	
d z| |	\}
}t|
rPt	
d| W  d	S W n  |yq } zd
t|v rlt	
d W Y d}~ dS  d}~ww ||
7 }|| q'z
| |\}}}W n |y } zd
t|v rt	
d W Y d}~dS  d}~ww || dk p|dk }t	tjr||ks|rt	
dt|| d nt	
dt|| d || |k p|S )rC  Tr   Nr:  rD  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFra  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speeduprI  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r  r8  ra   rh  rE  r  r"  rA  r  r  r)  rb  rc  rP   rf  r  r  rJ  r3   r4   )rS   r  Zsubkernel_nodesr<  rE  rG  Z
path1_listr  rJ  r>  msrZ  r  rH  Z	ms2_cloneZ_path2_listZsmall_kernelrU   rU   rV   ru    sr   


	z!Scheduler.speedup_by_combo_kernelr5  	ir.Layoutc                 C  s"   | j | }|jd usJ |j S rQ   )r}   rH   Z
get_layout)rS   r5  r   rU   rU   rV   get_buffer_layout  s   

zScheduler.get_buffer_layoutc                 C  sn   | j D ]1}| r4|jjD ]&}tjj|j}|r3t	|dkr3t
|jts3| g kr3tjj|j qqd S r9  )r  r>   r   r   rC   rw   r  r   rZ   r,   ru   rc   r/   r   Zzero_dim_cpu_tensor_listr  )rS   rH   rZ  r\  rU   rU   rV   r    s   

z$Scheduler.update_zero_dim_cpu_tensor)r  r  rO   rp   )rO   r  r  )r<  r7  rO   rp   r   )r  rP   rO   rp   )rH   r   rO   rD   r  )rJ  rD   rO   r  )rO   r  r  r1  rO   r'  r  r1  r+  r   rO   rP   )r-  r   r<  r_  rO   r'  )r>  r1  rO   r   )r  rD   r  rD   rO   rB  )rH   rD   rO   rD   rQ   )r  rt  rO   rp   r|  )r  r  rO   rw  r  rD   r  rD   rO   r   )r  rD   r  rD   r  r  rO   rP   r  rD   r  rD   rO   rX   r  )r  rD   r  rD   rz  r  rO   r   )r  r(   r  rD   r  rD   rO   r   )rZ  r%   rG  r&   rO   r   )r   r%   rO   rX   )ry  rw  rO   rw  )r  r  rO   r   )r  r  rO   rp   )r<  r_  rO   r  )r<  r7  rO   r  )rH   rD   rO   rp   )rZ   rP   r  r   rO   r   )rO   r  )r  r  r  r  rO   r  )rO   r  )r  r  r  r-   rO   rp   r>  r1  rO   r  )r  r  rO   r   )r5  rP   rO   r  )Frb   r   r   r   r   r  r  propertyr  setterr  r  r  r  r  r  r  r  r  r  r	  r  r  r)  r,  r.  r  rA  ri  rN  r&  r  r  rr  rm  r  r  r  r  r}  r  rr  r  r  r  r  r  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  rV  r  r  r  r  ru  r  r  r  rU   rU   r  rV   rF     s   
 

u




$ 
R
*










N
	  

j
0

2
.
9

>
K


; 

5
%
"


 











=




f

Kc                      s   e Zd ZdD fddZdEddZdFddZdGddZdGddZdGddZdHddZ	dIddZ
dJd%d&ZdKd*d+ZdLd.d/ZdEd0d1ZdMd2d3ZdEd4d5ZdNd7d8ZdOd;d<ZdPd>d?ZdQdBdCZ  ZS )Rr  rG   Optional[Scheduler]c                   s   t    || _d S rQ   )r  r   rG   r   r  rU   rV   r   ,  s   

zBaseScheduling.__init__rO   rp   c                 C  s   | j r
| j   d S d S rQ   )rG   r  r[   rU   rU   rV   free_buffers_in_scheduler0  s   z(BaseScheduling.free_buffers_in_schedulerr<  r_  OrderedSet[BackendFeature]c                 C  s   t  S )z0Return a set of .codegen.common.BackendFeature()r   r;  rU   rU   rV   get_backend_features4  s   z#BaseScheduling.get_backend_featuresr  rD   r  r   c                 C  r`  )zO
        Check whether node1 and node2 can be vertically fused or not.
        ra  r  rU   rU   rV   r  8     z BaseScheduling.can_fuse_verticalc                 C  r`  )zQ
        Check whether node1 and node2 can be horizontally fused or not.
        ra  r  rU   rU   rV   r  @  r  z"BaseScheduling.can_fuse_horizontalc                 C  r   )au  
        A Multi-Output Template (referenced in #144012) is a template node
        with MultiOutputLayout, and its output buffers are instances of MultiOutput.
        In this context, we verify whether node1 represents the Multi-Output Template
        and node2 corresponds to one of its outputs. If so, we further check if
        backend supports this fusion.
        FrU   r  rU   rU   rV   r  H  s   
z.BaseScheduling.can_fuse_multi_outputs_templater  c                 C  s(   |  s|  rt||S t||S )z 
        Fuse two nodes
        )rC  ri  rH  r  r  rU   rU   rV   rH  T  s   zBaseScheduling.fuser  r  "tuple[tuple[sympy.Expr, ...], ...]c                 C  r`  )z[
        Process the iteration sizes in case a transformation needs to be applied.
        ra  )rS   r  rU   rU   rV   r  _  r  zBaseScheduling.group_fnr  epilogue_nodesr1  r  Optional[str]c                 C  r`  )z
        Given a template node, generate a kernel.

        This function is only available for triton now. If the third-party backend behaves as a sub-class
        of TritonScheduling, it can override it or reuse it.
        ra  )rS   r  r  r  rU   rU   rV   r  g  s   zBaseScheduling.codegen_templater  r+  rP   c                 C  r`  zD
        Generate a kernel given a list of pre-fused nodes.
        ra  )rS   r  r+  rU   rU   rV   r,  u  r  z.BaseScheduling.generate_kernel_code_from_nodesrH   (Union[FusedSchedulerNode, SchedulerNode]c                 C  r`  r  ra  r   rU   rU   rV   r  }     zBaseScheduling.codegen_nodec                 C  r`  )zt
        Generate synchronization code for the kernel. This method depends on the hardware characteristics.
        ra  r[   rU   rU   rV   r    r  zBaseScheduling.codegen_syncc                 C  r   )z
        Check whether the backend is requesting the scheduler to flush the generated kernel.
        If not supported, please return False.
        FrU   r[   rU   rU   rV   r       zBaseScheduling.ready_to_flushc                 C  r`  )z]
        Flush the generated kernel and python wrapper code to the source code file.
        ra  r[   rU   rU   rV   r    r  zBaseScheduling.flushr'  c                 C  r`  )r(  ra  r  rU   rU   rV   r)       z$BaseScheduling.benchmark_fused_nodesr-  r   c                 C  r`  )z
        Benchmark a compiled module and return the execution time
        in milliseconds on randomly generated inputs.
        ra  )rS   r-  rU   rU   rV   r.    r  z)BaseScheduling.benchmark_codegened_modulerX   c                 C  r   )z
        Return an unsigned integer which represents the priority of this fusion pair.
        The smaller is with higher priority.
        r   rU   r  rU   rU   rV   r    r  z'BaseScheduling.get_fusion_pair_priorityr>  r  c                 C  r`  )z
        Benchmark the list of nodes to combine and return the execution time
        and memory copy time in milliseconds on randomly generated inputs.
        ra  r@  rU   rU   rV   r    r  z%BaseScheduling.benchmark_combo_kernel)rG   r
  r   )r<  r_  rO   r  r  rd  )r  r  rO   r  )r  rD   r  r1  r  r1  rO   r  r  )rH   r  rO   rp   r   r  )r-  r   rO   r'  r  r  )rb   r   r   r   r  r  r  r  r  rH  r  r  r,  r  r  r  r  r)  r.  r  r  r  rU   rU   r  rV   r  +  s&    














	
	r  )r  r   rO   rP   )rH   rD   r  r  r}   r  rO   rp   )r5  r6  rO   rp   )r5  r6  rG   rF   r  r  rO   rp   )rU   )r  r  r  r  r  r  rO   r  )
__future__r   r  r   r  r  r   r  rb  r  r  r  r  r  rw  r   r   r   r   r   r   r	   r
   r   collections.abcr   typesr   r#  r   Ztorch._inductor.async_compileZtorch._dynamo.utilsr   r   Ztorch._inductor.codecacher   r   Ztorch._inductor.metricsr   r   Z%torch.fx.experimental.symbolic_shapesr   Ztorch.utils._ordered_setr   Ztorch.utils._sympy.symbolr   r   Ztorch.utils._tritonr   r   r   r   r   r   r   Zanalyze_preserves_zero_maskr    Zcodegen.commonr!   r"   r#   Zcomm_analysisr$   r%   r&   r'   r(   excr)   r*   r+   r,   r-   r.   r/   Z	loop_bodyr0   r  r1   r2   Zruntime.runtime_utilsr3   r4   r  r5   utilsr6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   ZvirtualizedrC   	getLoggerrb   r   Z_loggingZgetArtifactLoggerr  r  r   r  	dataclassrE   r   rD   r  re   r   r  r  r  ZconvolutionmmZbmmZaddmmZ
_scaled_mmr  r  rY  r*  r?  rC  r  ri  rB  r  r  r  r  rF   r  rU   rU   rU   rV   <module>   s    $<
h     ?


+	  

 T  >E.                   