
    [Th                   F   S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKrS SKrS SK	r	S SK
r
S SKrS SKrS SKrS SKrS SKJrJr  S SKJrJrJrJrJrJrJr  \(       a  S SKJr  S SKJr  S SKrS SKrS SKrS SKJ r J!r!  S SK"J#r#J$r$  S S	K%J&r&J'r'  S S
K(J)r)  S SK*J+r+  S SK,J-r-J.r.  S SK/J0r0  SSK1J2r2J3r3J4r4J5r5J6r6  SSK7J8r8  SSK9J:r:J;r;J<r<  SSK=J>r>  SSK4J?r?J@r@JArAJBrB  SSKCJDrDJErE  SSK5JFrFJGrGJHrHJIrIJJrJ  SSKKJLrL  SSKMJNrNJOrO  SSKPJQrQJRrR  SSKSJTrT  SSKUJVrVJWrWJXrXJYrYJZrZJ[r[J\r\J]r]J^r^J_r_J`r`JaraJbrb  SSKcJdrd  \R                  " \f5      rg\R                  R                  \fS5      rj\R                  R                  \fS5      rk\lS   rm\R                   " S S 5      5       ro\R                   " S! S"\o5      5       rp " S# S5      rq " S$ S%5      rrS@S& jrs " S' S(5      rt        SAS) jru\R                  R                  R                  \R                  R                  R                  \R                  R                  R                  \R                  R                  R                  \R                  R                  R                  S*.r} " S+ S,\q5      r~ " S- S.\q5      r " S/ S0\q5      r    SBS1 jr        SCS3 jr " S4 S5\q5      r " S6 S7\5      r " S8 S9\q5      r SD       SES: jjr\R                   " S; S<5      5       r\GR                  " 5       r " S= S25      r " S> S?5      rg)F    )annotationsN)Counterdefaultdict)AnyCallableGenericOptionalTYPE_CHECKINGTypeVarUnion)Sequence)
ModuleType)countersdynamo_timed)LambdaFuturePyCodeCache)get_metric_tableis_metric_table_enabled)free_unbacked_symbols
OrderedSet)free_symbol_is_typeSymT)
has_triton   )commsconfigdependenciesirmetrics)can_codegen_without_upcasts)BackendFeatureget_scheduling_for_deviceKernel) estimate_nccl_collective_runtime)Dep	MemoryDepStarDepWeakDep)GPUTooOldForTritonTritonMissing)ComputedBufferget_device_typeGraphPartitionSignatureMultiOutputMultiOutputLayout)LoopBody)MemoryPlanningInfoForBufferMemoryPlanningInfoForNode)
green_textred_text)SimplifyIndexing)cache_on_selfcmpdevice_need_guardget_device_tflopsget_dtype_sizeget_gpu_dram_gbpsIndentedBufferis_collectiveis_gpuis_multi_outputs_template#is_output_of_multi_outputs_templateis_waitsympy_product)Vfusionloop_orderingBaseSchedulerNodec                      \ rS rSr% S\S'   S\S'   S\S'   \R                  " \S9rS	\S
'   \R                  " \	S9r
S\S'   SS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSS jrSrg)SchedulerBufferN   	Scheduler	schedulerz	ir.BuffernodeOptional[BaseSchedulerNode]defining_op)default_factorylist[NodeUser]usersr2   
mpi_bufferc                D    U R                   nUc   eUR                  5       $ N)rO   get_name)selfops     Q/var/www/auris/envauris/lib/python3.13/site-packages/torch/_inductor/scheduler.pydefining_op_name SchedulerBuffer.defining_op_nameX   s#    ~~{{}    c                @    [        U R                  R                  5      $ rU   )hashrM   namerW   s    rY   __hash__SchedulerBuffer.__hash__]   s    DIINN##r\   c                   [        5       nU R                  5       nUR                  U S[        U R                  5      R
                   35        UR                  U SU R                  R                   35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        U R                  5       (       a-  UR                  U S[        U R                  5       5       35        [        U R                  5      S::  a0  UR                  U SU R                   35        UR                  5       $ UR                  U S35        UR                  S5         U R                   H  nUR                  U S35        M     S S S 5        UR                  S	5        UR                  5       $ ! , (       d  f       N/= f)
N: z
.layout = z.aliases = z.mutations = r   z	.users = z
.users = [,])r=   rV   	writelinetyperM   __name__layoutget_aliasespformatget_mutationslenrR   indentgetrawvalue)rW   resultr_   users       rY   	debug_strSchedulerBuffer.debug_str`   s   !}}D6DO$<$<#=>?D6DII,<,<+=>?v[9I9I9K1L0MNOv]74;M;M;O3P2QRStzz?avYtzzl;< !!## vZ01q! JJD$$vQZ0 ' " S!!!##	 "!s   *(F;;
G	c                6    U R                   R                  5       $ rU   rM   rV   r`   s    rY   rV   SchedulerBuffer.get_namet       yy!!##r\   c                0   U R                   c   eU R                   R                  5       (       d  g U R                   R                  5       (       dV  U R                   R                  5       (       d7  [	        U R                   R                  5       [        R                  5      (       a4  [        R                  R                  R                  U R                   5        g [        [        R                  S5      (       a  U R                  5       [        R                  R                  ;   a  [        R                  R                  U R                  5          nXR                   R"                  ;   a$  U R                   R"                  U   R                   nO#U R                   R$                  U   R                   n[        R                  R                  R'                  UU R                   5        g [        R                  R                  R                  U R                   5        g )Nargs)rM   should_allocateget_inputs_that_alias_outputget_mutation_names
isinstanceget_output_specr   CommBufferLayoutrD   graphwrapper_codecodegen_allocationhasattrkernelrV   inplace_update_buffersrL   name_to_donated_buffername_to_bufcodegen_inplace_reuse)rW   input_buffer_nameinput_buffers      rY   allocateSchedulerBuffer.allocatew   sc   yy$$$yy((** II2244yy++--$))335r7J7JKKGG  33DII> AHHf%%188#B#BB !" ? ? P NN$I$II#~~DD% $   $~~99:KLQQGG  66		
 GG  33DII>r\   c                &   U R                   c   e[        U R                   R                  [        R                  5      (       d  [        U R                   5      (       a  gU R                   H$  n[        UR                   [        5      (       d  M$    g   gNFT)rM   r~   rj   r   
NoneLayoutr@   rR   
OutputNode)rW   uses     rY   can_freeSchedulerBuffer.can_free   sm    yy$$$dii&&66:SII;
 ;
 ::C#((J//  r\   c                4   0 nU Hr  n[        UR                  5      U;   a?  UR                  U[        UR                  5         5      U[        UR                  5      '   M[  X2[        UR                  5      '   Mt     [        UR	                  5       5      U l        g rU   )idrM   mergelistvaluesrR   )rW   rR   rq   r   s       rY   	set_usersSchedulerBuffer.set_users   sm    &(C#((|v%'*yy3881E'Fr#((|$'*r#((|$	 
 &--/*
r\   c                T    U R                   c   eU R                   R                  5       $ rU   )rM   r|   r`   s    rY   rk   SchedulerBuffer.get_aliases   s%    yy$$$yy5577r\   c                T    U R                   c   eU R                   R                  5       $ rU   )rM   r}   r`   s    rY   rm   SchedulerBuffer.get_mutations   %    yy$$$yy++--r\   )rR   Nreturnstrr   intr   Noner   bool)rR   rQ   r   r   r   zSequence[str])ri   
__module____qualname____firstlineno____annotations__dataclassesfieldr   rR   r2   rS   rZ   ra   rs   rV   r   r   r   rk   rm   __static_attributes__ r\   rY   rI   rI   N   sq    
O,,'--dCE>C.9.?.?3/J+ 
$$($?B
+8.r\   rI   c                  $    \ rS rSr% SrS\S'   Srg)SchedulerDonatedBuffer   NrN   rO   r   )ri   r   r   r   rO   r   r   r   r\   rY   r   r      s    /3K,3r\   r   c                     \ rS rSr% S\S'   S\S'   S\S'   S\S	'   S\S
'   S\S'   SAS jrSBS jrSCS jrSCS jrSCS jr	SDS jr
SCS jrSES jr      SFS jrSGS jrSHS jrSIS jrSJS jr      SKS jrSES jrSLS jrSLS jrSES jrSES jr    SMS  jrSCS! jrSCS" jr\SLS# j5       r\SLS$ j5       r\SIS% j5       r\SIS& j5       rSNS' jr SOS( jr!SPS) jr"SQS* jr#SIS+ jr$SIS, jr%SIS- jr&SIS. jr'SIS/ jr(SIS0 jr)SIS1 jr*SRS2 jr+SIS3 jr,SES4 jr- SS     STS5 jjr.\SUS6 j5       r/\SUS7 j5       r0\SUS8 j5       r1      SVS9 jr2      SWS: jr3\SXS; j5       r4SYS< jr5SZS= jr6\7    S[S> j5       r8S?r9g@)\rG      z7tuple[torch.device, tuple[tuple[sympy.Expr, ...], ...]]groupdependencies.ReadWritesread_writeszOrderedSet[Dep]unmet_dependenciesr   	min_order	max_orderr3   mpi_nodec                     Xl         S U l        g )Nc                     / $ rU   r   )rz   kwargss     rY   <lambda>,BaseSchedulerNode.__init__.<locals>.<lambda>   s    Br\   )rL   debug_device_str)rW   rL   s     rY   __init__BaseSchedulerNode.__init__   s    $-& 	r\   c           	     `   Xl         [        [           " 5       U l        [        [           " 5       U l        SU l        UR                  5        Vs/ s H  n[        U R                  UU S9PM     snU l	        U R                   Vs0 s H  o3R                  5       U_M     snU l        g s  snf s  snf )NF)rL   rM   rO   )rM   r   r   	ancestors
last_usagewrittenget_outputsrI   rL   outputsrV   outputs_by_name)rW   rM   outputbufs       rY   _init_from_node!BaseSchedulerNode._init_from_node   s    ,0	#C*$
   **,/
 - .. 
 -/
 ,0<<<
+7CLLNC<<
/
<
s   B&B+c                V    [        U 5      R                   SU R                  5       < S3$ )Nz(name=)rh   ri   rV   r`   s    rY   __repr__BaseSchedulerNode.__repr__   s'    t*%%&fT]]_,?qAAr\   c                P   U R                  5       n[        5       nUR                  U S[        U 5      R                   S[        [        U SS5      5      R                   SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR                  UR                  5       5        M$     SSS5        UR                  S5         UR                  U R                  5       5        UR'                  5       R)                  5       $ ! , (       d  f       N]= f! [          a    ["        R%                  SSS9   NOf = f)#Longer form printout for trace logsrd   (rM   N)

.writes = 
.unmet_dependencies = .met_dependencies = z.outputs = [
        rf   Ignoring error in debug_str()Texc_info)rV   r=   splicerh   ri   getattrrl   r   writesr   readsro   r   rs   rg   debug_str_extra	Exceptionlogwarningrp   rstrip)rW   r_   r   outs       rY   rs   BaseSchedulerNode.debug_str   sv   }}

bd		QtGD&$$?@IIJ Kj))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 		
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   %7E36F 3
FF%$F%c                    g)N r   r`   s    rY   r   !BaseSchedulerNode.debug_str_extra       r\   c                $    U R                  U 5      $ rU   )r   r`   s    rY   _debug_str_for_device'BaseSchedulerNode._debug_str_for_device  s    $$T**r\   c                   [        U R                  SS 5      nSn[        U[        R                  R
                  R                  5      (       a$  SUR                  UR                  5       /SSS9-   nOe[        U[        R                  R
                  R                  5      (       a2  SUR                  UR                  5       UR                  5       /SSS9-   nU  U 3$ )Ndatar   z, F)shorten	multiline)r   rM   r~   torch	_inductorr   	Pointwise
str_helperget_size	Reductionget_reduction_sizeget_reduction_type)rW   
maybe_datadata_strs      rY   debug_str_short!BaseSchedulerNode.debug_str_short  s    TYY5
j%//"4"4">">??j33$$&'% 4  H 
EOO$6$6$@$@AAj33..0*2O2O2QR 4  H
 z""r\   c                p    [         R                  SU U R                  U R                  R                  5        g )Nz(%s: unmet_dependencies = %s, writes = %s)r   infor   r   r   r`   s    rY   log_detailsBaseSchedulerNode.log_details  s,    6####		
r\   c                    g rU   r   )rW   self_dep	other_deps      rY   reorder_loops_by_dep_pair+BaseSchedulerNode.reorder_loops_by_dep_pair  s     	r\   c                X    U R                  U R                  R                  U5      5        g rU   )set_read_writesr   renamerW   renamess     rY   update_mutated_names&BaseSchedulerNode.update_mutated_names   s!    T--44W=>r\   c                X    U R                  U R                  R                  U5      5        g rU   )r  r   	with_readrW   deps     rY   add_fake_depBaseSchedulerNode.add_fake_dep#  s!    T--77<=r\   c                B    [        S U R                  5        5       5      $ )Nc              3  n   #    U  H+  oR                  5       =(       d    UR                  5       v   M-     g 7frU   )rk   rm   ).0r   s     rY   	<genexpr>=BaseSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>'  s*      
@ROO4!2!2!44@Rs   35)anyr   r`   s    rY   has_aliasing_or_mutation*BaseSchedulerNode.has_aliasing_or_mutation&  s%     
@D@P@P@R
 
 	
r\   c                f    Xl         U R                   R                  U l        U R                  5         g rU   )r   r   r   
prune_deps)rW   rws     rY   r  !BaseSchedulerNode.set_read_writes+  s&    "&"2"2"8"8r\   c                b   ^ U R                  5       n[        U4S jU 5       5      nX1-
  U l        g )Nc              3  F   >#    U  H  nTR                  X5      v   M     g 7frU   )get)r  kmutation_real_names     rY   r  3BaseSchedulerNode.set_last_usage.<locals>.<genexpr>4  s      !U1"4"8"8">">   !)used_or_aliased_buffer_namesr   r   )rW   future_used_buffersr*  used_bufferss     ` rY   set_last_usage BaseSchedulerNode.set_last_usage0  s-     88:!!U!UU&<r\   c                J    U R                    H  nUR                  5         M     g rU   )r   r   )rW   r   s     rY   mark_runBaseSchedulerNode.mark_run7  s    <<CLLN  r\   c                    [        S [        R                  " U R                  R                  U R                  R
                  5       5       5      $ )Nc              3  :   #    U  H  nUR                   v   M     g 7frU   r_   r  r  s     rY   r  6BaseSchedulerNode.used_buffer_names.<locals>.<genexpr><  s      
W HHW   )r   	itertoolschainr   r   r   r`   s    rY   used_buffer_names#BaseSchedulerNode.used_buffer_names;  s?     
 t'7'7'='=t?O?O?V?VW
 
 	
r\   c                N  ^ [         [           " 5       m[        R                  " U R                  R
                  U R                  R                  5       Vs/ s H  nUR                  PM     nn[        U5      S:  a  UR                  5       nTR                  U5        [        R                  R                  R                  U5      (       aD  UR                  U4S j[        R                  R                  U   R!                  5        5       5        [        U5      S:  a  M  T$ s  snf )Nr   c              3  8   >#    U  H  nUT;  d  M  Uv   M     g 7frU   r   )r  alias
used_namess     rY   r  ABaseSchedulerNode.used_or_aliased_buffer_names.<locals>.<genexpr>L  s(      "5 J.	 E"5s   
	)r   r   r;  r<  r   r   r   r_   rn   popaddrD   r   name_to_bufferr(  extendr|   )rW   r  depsrB  s      @rY   r-  .BaseSchedulerNode.used_or_aliased_buffer_namesA  s    _&
 !t'7'7'='=t?O?O?V?VW
W HHW 	 
 $i!m((*CNN3ww%%))#.. !"!7!7"224"5 	 $i!m 
s   D"c                N   ^  [        U 4S jT R                   5       5      T l        g )Nc              3  t   >#    U  H-  nUR                   TR                  R                  ;  d  M)  Uv   M/     g 7frU   )r_   rL   available_buffer_namesr  r  rW   s     rY   r  /BaseSchedulerNode.prune_deps.<locals>.<genexpr>V  s0      -
.xxt~~DDD C.s   (8	8r   r   r`   s   `rY   r#  BaseSchedulerNode.prune_depsU  s#    ", -
..-
 #
r\   c                   ^ ^ SU 4S jjm[        U4S jT R                  R                   5       5      nT R                  T R                  R	                  U5      5        g )Nc                   > [        U [        5      (       d  gTR                  R                  U R                     R                  5       nU[        R                  R                  ;   $ NF)	r~   r)   rL   r   r_   rZ   rD   r   removed_operations)r  op_namerW   s     rY   should_prune7BaseSchedulerNode.prune_weak_deps.<locals>.should_prune^  sI    c7++nn00:KKMGagg8888r\   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frU   r   r  r  rV  s     rY   r  4BaseSchedulerNode.prune_weak_deps.<locals>.<genexpr>d  s      
1C\#5FCC1   !	!r  r&   r   r   )r   r   r   r  remove_reads)rW   	to_removerV  s   ` @rY   prune_weak_deps!BaseSchedulerNode.prune_weak_deps\  sN    	9  
++11
 
	 	T--::9EFr\   c                D    [        XU R                  R                  5        g rU   )_prune_redundant_depsrL   r   )rW   name_to_fused_nodes     rY   prune_redundant_deps&BaseSchedulerNode.prune_redundant_depsi  s     	d8R8RSr\   c                T    U R                   c   eU R                   R                  5       $ rU   )rM   get_operation_namer`   s    rY   rV   BaseSchedulerNode.get_namen  r   r\   c                "    U R                  5       $ rU   rV   r`   s    rY   get_first_name BaseSchedulerNode.get_first_namer  s    }}r\   c                B    [        S U R                  5        5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   rj  )r  rM   s     rY   r  8BaseSchedulerNode.get_operation_names.<locals>.<genexpr>w  s     G6Fd--//6F   )r   	get_nodesr`   s    rY   get_operation_names%BaseSchedulerNode.get_operation_namesu  s    Gdnn6FGGGr\   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   rj  r  r   s     rY   r  5BaseSchedulerNode.get_buffer_names.<locals>.<genexpr>{  s     ALS,,..Lrp  )r   r   r`   s    rY   get_buffer_names"BaseSchedulerNode.get_buffer_namesy  s    ADLLAAAr\   c                B    [        S U R                  5        5       5      $ )Nc              3  d   #    U  H&  n[        U[        5      =(       a
    [        US S9v   M(     g7f)T)disallow_fp32_opsNr~   SchedulerNoder!   r  ns     rY   r  ABaseSchedulerNode.can_codegen_in_low_precision.<locals>.<genexpr>  s6      
 & q-( G+AFG%s   .0allrq  r`   s    rY   can_codegen_in_low_precision.BaseSchedulerNode.can_codegen_in_low_precision}  s%     
 ^^%
 
 	
r\   c                B    [        S U R                  5        5       5      $ )Nc              3  f   #    U  H'  n[        U[        5      =(       a    [        U5      v   M)     g 7frU   r}  r  s     rY   r  @BaseSchedulerNode.can_codegen_without_upcasts.<locals>.<genexpr>  s,      
% q-(K-H-KK%s   /1r  r`   s    rY   r!   -BaseSchedulerNode.can_codegen_without_upcasts  s#     
^^%
 
 	
r\   c                    U /$ rU   r   r`   s    rY   rq  BaseSchedulerNode.get_nodes  s	    vr\   c                    U R                   $ rU   )r   r`   s    rY   r   BaseSchedulerNode.get_outputs  s    ||r\   c                     U R                   U   $ rU   )r   )rW   buf_names     rY   
get_outputBaseSchedulerNode.get_output  s    ##H--r\   c                T    U R                   c   eU R                   R                  5       $ rU   )rM   
get_devicer`   s    rY   r  BaseSchedulerNode.get_device  s%    yy$$$yy##%%r\   c                V    U R                  5       nUS L=(       a    UR                  S:H  $ Ncpu)r  rh   rW   devices     rY   is_cpuBaseSchedulerNode.is_cpu  s'    "T!:fkkU&::r\   c                b    U R                  5       nUS L=(       a    [        UR                  5      $ rU   )r  r?   rh   r  s     rY   r?   BaseSchedulerNode.is_gpu  s'    "T!9fV[[&99r\   c                    grS  r   r`   s    rY   is_reductionBaseSchedulerNode.is_reduction      r\   c                    grS  r   r`   s    rY   is_split_scanBaseSchedulerNode.is_split_scan  r  r\   c                    grS  r   r`   s    rY   is_templateBaseSchedulerNode.is_template  r  r\   c                    grS  r   r`   s    rY   	is_externBaseSchedulerNode.is_extern  r  r\   c                    grS  r   r`   s    rY   
is_foreachBaseSchedulerNode.is_foreach  r  r\   c                    grS  r   rW   read_deps     rY   can_inplaceBaseSchedulerNode.can_inplace  r  r\   c                    grS  r   r`   s    rY   has_side_effects"BaseSchedulerNode.has_side_effects  r  r\   c                X  ^  SSK Jn  [        T [        5      (       a  [        R
                  (       a  [        R                  R                  T R                  5       [        R                  5      (       a  [        [        R                  [        R                  R                  R                   R"                  5      (       a  [%        [        R                  SS5      b  ['        [        R                  S5      (       d  gT R(                  [        R                  R*                  -  T R,                  R.                  -  nSU 4S jjnT R1                  5        GHQ  nUR2                  nUc   eUR5                  5       (       aV  UR7                  5       (       dA  UR9                  5       (       d,  UR;                  5       [        R                  R<                  ;   a  M  T R>                  R@                   GH  nURB                  T R,                  RD                  ;   a$  T R,                  RD                  URB                     nO/T R,                  RF                  RI                  URB                  5      nU(       d  M  [        R                  RJ                  RM                  UT 5      (       d  M  [        URN                  [P        5      (       a  M  URR                  c   eURR                   Vs/ s H%  nUR2                  R;                  5       U;  d  M#  UPM'     n	n[U        U	5      S:X  d  GM3  U	S   RV                  (       d  GMJ  U	S   R2                  T L d  GM_  UR2                  c  GMo  [        UR2                  RY                  5       [Z        R\                  [Z        R^                  [Z        R`                  45      (       a  GM  URN                  (       am  [        URN                  R2                  [Z        Rb                  [Z        Rd                  45      (       a*  [U        UR2                  R7                  5       5      S:  a  GMF  U" UR2                  UR2                  5      (       d  GMk  U" U5      (       d  GM{  [        R                  Rf                  Ri                  UR;                  5       UR;                  5       5        [        [        R                  [        R                  R                  R                   R"                  5      (       an  [        R                  Rj                  Rm                  UR;                  5       5        [        R                  Rj                  Rm                  UR;                  5       5        UR;                  5       [        R                  Rn                  UR;                  5       '     GMO     GMT     gs  snf )	zf
Decide if there should be inplace updates for the node
and record the decision in the active kernel.
r   )can_match_buffer_size	mutationsNrz   c                  >^ U R                   R                  T5      nU R                  5       m[        5       nU R                   H~  nUR
                  n[        U[        5      (       d  M&  U R                   R                  U5      ULa  MF  UU4S jUR                  R                  5        5       -  n[        U5      S:  d  M~    g   g)Nc              3  L   >#    U  H  nUR                   T:X  d  M  Uv   M     g 7frU   r7  )r  or  s     rY   r  ^BaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node.<locals>.<genexpr>  s&      Evv) AEs   $	$r   FT)rL   get_fused_noderV   r   rR   rM   r~   rG   r   reads_and_writesrn   )buf_to_be_inplaced
fused_noderH  rr   	user_noder  rW   s        @rY   single_index_in_fused_nodeKBaseSchedulerNode.decide_inplace_update.<locals>.single_index_in_fused_node  s    
 ,55DDTJJ)224H %/LD*00 II	!)->?? '00??	J%&  &22CCE 
 t9q= # 1& r\   r   )r  rI   r   r   )8codegen.wrapperr  r~   r~  r   inplace_buffersrD   r   has_featurer  r"   INPLACE_BUFFERSr   r   r   codegensimd
SIMDKernelr   r   r   rT  rL   completed_operationsr   rM   r{   r|   r}   rV   removed_buffersr   r   r_   r   r   r(  r   	can_reuserO   NopKernelSchedulerNoderR   rn   r  r   r   r   r0   MutationLayoutSHOULDREMOVEFallbackKernelr/   rz   make_inplacer  rE  r   )
rW   r  inconsequential_nodesr  r   buf_noderead	input_bufxremaining_usess
   `         rY   decide_inplace_update'BaseSchedulerNode.decide_inplace_update  s   
 	; t]++&&##DOO$5~7U7UVVqxx)@)@)E)E)P)PQQ188[$7C &)) NNgg(()nn112 		@ ##%CxxH''',,..88::..00<<>QWW%<%<<((..99 E EE $ E Edii PI $ : : > >tyy II I,,66y$GG&y'<'<>TUU$??666 "+&!0A66??,4II !0 # & N+q0*1-999*1-22d:%NN6 *%NN::< " " 4 4 " = =! ! &11 * ) 5 5 : :!#!2!2BNN C! ! !$INN$O$O$Q RUV V1)..#((KK6yAA
 2293E3E3GX%HHeoo&=&=&B&B&M&M  HH..2293E3E3GHHH..223<<>B &..0 77G q / &0&s   "V'V'c                b   [         R                  (       d  g U(       a  U R                  (       a  g U R                  c   eU R                  R	                  5       n/ nU GH&  nUR
                  S:X  a  M  UR                  S5        UR                  S5        SUR
                   SUR                   3nSUR                  ;   a  USUR                  S    3-   nUR                  U5        SUR                  ;   d  M  UR                  S    nUR                  S	5      S
   nUR                  SUR                  SS5      R                  SS5      R                  SS5      -   5        UR                  S5        UR                  S5        GM)     [        U5      S:X  a  g UR                  U5        SU l        g )Nr   r   z#pragma CMT ORIGIN:z#pragma CMT  seq_nrz seq_nr:stack_trace|{z{{}z}}r   \z#pragma CMT END ORIGINr   T)r   comment_originr   rM   get_originsrX   appendtargetmetasplitreplacern   
writelines)	rW   buffer	only_onceorigins	out_linesr  op_info_strr  stack_trace_last_lines	            rY   codegen_originating_info*BaseSchedulerNode.codegen_originating_info6  s    $$yy$$$))'')	AttxR 23(az:K166!)hqvvh7G6H,II[)&!"!6 7(3(9(9#(>r(B%  "+33C>WS$'WT4()   !9:  $- 0 y>Q 	)$r\   c                "    U R                  SSS9$ )NTinclude_readsinclude_writes!get_read_write_buffers_sizes_implr`   s    rY   get_read_write_buffers_sizes.BaseSchedulerNode.get_read_write_buffers_sizesb  s    55t 6 
 	
r\   c                "    U R                  SSS9$ )NTFr  r  r`   s    rY   get_read_buffer_sizes'BaseSchedulerNode.get_read_buffer_sizesh  s    55u 6 
 	
r\   c                "    U R                  SSS9$ )NFTr  r  r`   s    rY   get_write_buffer_sizes(BaseSchedulerNode.get_write_buffer_sizesn  s    55 6 
 	
r\   c                L    [        U R                  XS9R                  5       SS9$ )Nr  r   )start)sumget_read_write_buffer_accessesr   )rW   r  r  s      rY   r  3BaseSchedulerNode.get_read_write_buffers_sizes_implt  s1     //+ 0 fh	
 	
r\   c                  ^ ^^^^^ [        T [        5      (       a  0 $ [        T [        5      (       a!  [        T R                  [        5      (       a  0 $ SS jm[        T [
        5      (       a@  T" [        T R                  5       S   5      [        T R                  5       S   5      -  5      mO[        S5      m[        R                  " [        5      nU(       a:  T R                  R                   H   nX4R                     R                  U5        M"     U(       a:  T R                  R                    H   nX4R                     R                  U5        M"     U(       a&  [#        S T R                  R                   5       5      O	[#        5       nU(       a&  [#        S T R                  R                    5       5      O	[#        5       nSU 4S jjm[        T [$        5      (       a  [#        UU 4S jU 5       5      nXg-
  nXW-
  n0 nXV-   H  n	['        U4S	 jX9    5       5      mU	[(        R*                  R,                  ;   a  [(        R*                  R,                  U	   n
O>U	[(        R*                  R.                  ;   a  [(        R*                  R.                  U	   n
OM      SUUU U4S
 jjmT" U
5      nX;  a  XU	'   M  X==   U-  ss'   M     U$ )a  
Counting the number of bytes accessed for a kernel is
surprisingly tricky. In particular, there is a differentiation
between 'theoretical' memory accesses and practical memory
accesses. For example, a layernorm kernel may actually access an
input 3 times, but in theory, it only needs to access its input
once (and may be optimized to do so through say, persistent
reductions)

Another example is that even though a buffer is passed in, we may
not access the entire buffer. This may occur if we are accessing
a slice of the buffer. Another tricky case is for indirect
indexing, where the amount of bytes accessed depends on the
values of the input.

What this function aims to compute is the memory accesses for
worst-case inputs, best-case optimization. What this means is
that for each buffer we compute the amount of potential accesses in two ways and take the minimum.

1. Numel in ranges multiplied by number of deps the buffer has
2. The buffer size

Returns memory accesses per buffer.
c                R    [         R                  R                  R                  U SS9$ )Nr   fallback)rD   r   sizevars	size_hint)ss    rY   try_size_hintGBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.try_size_hint  s"    77##--a!-<<r\   r   r       eAc              3  8   #    U  H  oR                   v   M     g 7frU   r7  r8  s     rY   r  CBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.<genexpr>  s     B+ACxx+A   c              3  8   #    U  H  oR                   v   M     g 7frU   r7  r8  s     rY   r  r    s     C+BCxx+Br  c                   > TR                   R                  U    R                  n[        S U 5       5      n[	        U[        U5      -
  5      S:  $ )Nc              3  8   #    U  H  oR                   v   M     g 7frU   rM   )r  rr   s     rY   r  \BaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized.<locals>.<genexpr>  s     !>))r  r   )rL   r   rR   r   rn   )r   snodesrR   buf_usesrW   s       rY   is_materializedIBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.is_materialized  sG    NN..s399E!!>!>>Hx*V"44599r\   c              3  \   >#    U  H!  nT" UTR                   5      (       a  M  Uv   M#     g 7frU   r  )r  r  r  rW   s     rY   r  r    s#      )%_S$++-Nvs   ,	,c              3  (   >#    U  H  nTv   M	     g 7frU   r   )r  r  
node_numels     rY   r  r    s     $R;QCZ;Qs   c                R  > U (       d  g[        U [        R                  5      (       a  U R                  5       $ [        U R                  [
        5      (       a  TR                  R                  U R                  5          R                  nSnU H  n[        UR                  [        5      (       d   e[        UR                  R                  [        5      (       a8  UR                  R                  5        H  nUT" UR                  5      -  nM     M    g   U$ [        U R                  [        R                  5      (       a#  [        U4S jU R!                  5        5       5      $ T	" [#        U R%                  5       5      5      n['        U R)                  5       5      [+        TU5      -  $ )Nr   c              3  n   >#    U  H*  nT" [         R                  R                  U5      5      v   M,     g 7frU   )rD   r   
get_buffer)r  mut_nameget_buf_bytess     rY   r  ZBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes.<locals>.<genexpr>  s/      (@H &agg&8&8&BCC(@   25)r~   r   TorchBindObjectr$  rj   r0   rL   r   rV   rR   rM   rG   r/   r   r   r  r}   rC   r   r;   	get_dtypemin)
r   rR   totrr   	sched_buf	buf_elemsbuf_accessed_elemsr$  rW   r  s
         rY   r$  GBaseSchedulerNode.get_read_write_buffer_accesses.<locals>.get_buf_bytes  sG    c2#5#566,,..

,=>> !NN66s||~FLLEC %)$))5FGGGG%diinnkBB-1YY-B-B-D	 #}Y^^'D D .E $% !& J

BMM:: (+(>(>(@  
 !.mCLLN.K LI)#--/:S*I>  r\   )r  z
sympy.Exprr   r   )r   r   r  Sequence[BaseSchedulerNode]r   r   )r   z<Optional[Union[ir.Buffer, ir.TensorBox, ir.TorchBindObject]]r   r   )r~   r  ExternKernelSchedulerNoderM   r/   r~  rC   
get_rangesr   collectionsr   r   r   r   r_   r  r   r   FusedSchedulerNoder  rD   r   rF  graph_inputs)rW   r  r  buf_accessesr  r   r   r  buf_byte_accessesr  r   	buf_bytesr-  r$  r  r  r  s   `           @@@@@rY   r  0BaseSchedulerNode.get_read_write_buffer_accesses~  s\   6 d233Id566:II{<
 <
 I	= dM**&doo/23 1! 456J
 SJ"..t4''--XX&--c2 . ''..XX&--c2 /
  B4+;+;+A+ABB 	  C4+;+;+B+BCC 		:
 d.//( )%) O -F+E,.H!$$R<;Q$R!R177111gg,,X6QWW111gg**84!Q!! !F &c*I0.7(+!+y8+c 'f ! r\   c                   U R                  5       S   R                  5       S   nUR                  R                  5       n[	        [        U5      5      (       d  g[        U R                  5      (       aA  [        U R                  [        R                  5      (       d   e [        U R                  5      $ [        U R                  5      (       a  gUR                  R!                  5       n [#        5       n[%        U5      S-  n[        U [(        5      (       Ga  [        U R                  [        R*                  5      (       d   S[-        U R                  5      < 35       e[.        R1                  [3        U R                  SS5      S5      nUGb\  SSKJn  SSKJn	  [=        S	 U R                  R>                   5       5      (       a  gU" 5        n
U	" S
S9 n[@        RB                  " U R                  RD                  5         [@        RF                  " U
5         SSKJ$n  U R                  R>                   Vs/ s H
  nU" US
S9PM     nnU R                  RJ                  nURL                  " U/UQ70 U R                  RN                  D6  SnURQ                  5       nU RS                  5       nUU-  U-  S-  nUU-  n[U        UU5      sSSS5        sSSS5        sSSS5        sSSS5        $  g[        U [V        5      (       d  [        U R                  [X        5      (       a  U RS                  5       U-  $ g! [         a  n[        R                  U5         SnAgSnAf[         a  n[        R                  U5         SnAgSnAff = f! [&         a     gf = fs  snf ! , (       d  f       O= f SSS5        O! , (       d  f       O= fSSS5        O! , (       d  f       O= fSSS5        g! , (       d  f       g= f)z2
Returns estimated op runtime in nanoseconds (ns)
r   Nl    J)type(self.node)=python_kernel_namer   )FakeTensorMode)FlopCounterModec              3  l   #    U  H*  n[        [        UR                  5       5      5      S :  v   M,     g7fr   N)rn   r   	get_numelr  s     rY   r  :BaseSchedulerNode.get_estimated_runtime.<locals>.<genexpr>6  s-      - -akkm<=A-s   24F)displayr   )ir_node_to_tensor)guard_shapeg      ?r  )-rq  r   rM   r   r?   r-   r>   r~   r   IRNoder%   
ValueErrorr   r  	TypeErrorrB   maybe_get_dtyper<   r:   r   r0  ExternKernelrh   kernel_name_to_opr(  r   torch._subclasses.fake_tensorr<  torch.utils.flop_counterr=  r  inputsrD   set_current_nodefx_nodeset_fake_moderC  	__class__process_kernelr   get_total_flopsr  maxr3  r,   )rW   r   rj   edtypegpu_memory_bandwidth	gpu_flopsrX   r<  r=  	fake_modeflop_counter_moderC  inputfake_inputsclsfactorcounted_flopscounted_bytescompute_timetransfer_times                        rY   get_estimated_runtime'BaseSchedulerNode.get_estimated_runtime  sN   
 nnq!--/2))+of-.. ##dii3333
7		BB TYY
 ((*	#4#6 )%069I d566dii99P>Nd499o=O;PP9"&&		#7<dB
 ~HD !YY--    #$	#E26G&&tyy'8'89OOI.5 &*YY%5%5#%5E *%UC%5   # ))--C&&rLKL499;K;KL !F$5$E$E$GM$($E$E$GM$*]$:Y$F##ML$14H$HM |];' /. :9 32 %$ V  011ZII~6
 6
 4469MMMU       		<#	 /.. :99 322 %< = %$< s   L4 *N O++O?N7N	5NA?N		N7	O	O+4
N>MN%M??N
NNN	
N*&N7.	O7
OO	O+
O	O++
O9c                    g rU   r   r`   s    rY   get_template_node#BaseSchedulerNode.get_template_node_      r\   c                0    U R                  5       nUc   eU$ rU   rf  )rW   templates     rY   get_template_node_or_throw,BaseSchedulerNode.get_template_node_or_throwb  s!    ))+###r\   c                `    [        S [        U 5       5       5      nU SU nX   nXS-   S nX#U4$ )zA
For the list of nodes, get the prologue, template, and epilogue
c              3  X   #    U  H   u  pUR                  5       (       d  M  Uv   M"     g 7frU   r  )r  ir  s      rY   r  CBaseSchedulerNode.get_prologue_template_epilogue.<locals>.<genexpr>n  s     P,<DAaa,<s   *	*Nr   )next	enumerate)nodestemplate_indexprologuetemplate_nodeepilogues        rY   get_prologue_template_epilogue0BaseSchedulerNode.get_prologue_template_epilogueg  sH     PIe,<PP.)-!+-.00r\   )
r   r   r   rM   r   r   r   rL   r   r   N)rL   rK   r   r   )rM   ir.Operationr   r   r   )r   z	list[str]r   r	  r'   r
  r'   r   r   r  dict[str, str]r   r   )r  r&   r   r   r   )r$  r   r   r   r.  OrderedSet[str]r*  r  r   r   r   r  rc  dict[str, BaseSchedulerNode]r   r   r   r/  )r   zSequence[SchedulerBuffer])r  r   r   rI   r   Optional[torch.device]r  zdependencies.Depr   r   T)r  r=   r  r   r   r   r   )r  r   r  r   r   r   )r  r   r  r   r   zdict[str, int])r   floatr   zOptional[ir.TemplateBuffer])r   zir.TemplateBuffer)ru  list[BaseSchedulerNode]r   zJtuple[list[BaseSchedulerNode], BaseSchedulerNode, list[BaseSchedulerNode]]):ri   r   r   r   r   r   r   r   rs   r   r   r  r  r  r  r  r   r  r0  r3  r=  r-  r#  r_  rd  rV   rk  r7   rr  rx  r  r!   rq  r   r  r  r  r?   r  r  r  r  r  r  r  r  r  r  r  r  r  r  rc  rf  rl  staticmethodrz  r   r   r\   rY   rG   rG      sX   BB(('' NN''

&B*2+#
!.7	
?>


=#2=HV=	=
(
GT">T	T
. H H B B 
 
 
 
.&;:~B 9=*$*15*	*X 
 

 
 

 
 


!
37
	
C!!C!37C!	C!J Y Yv
 1&1	S1 1r\   c                  P    \ rS rSr% / SQrS\S'   S\S'   SS jrSS jrSS	 jrS
r	g)	WhyNoFuseiv  )node1node2reasonrz   r   r  ztuple[Any, ...]rz   c                    Xl         X l        g rU   )r  r  rW   r  r  s      rY   r   WhyNoFuse.__init__}  s    

r\   c                F    Xl         X l        [        R                  U 5        g rU   )r  rz   
fusion_logdebug)rW   r  rz   s      rY   __call__WhyNoFuse.__call__  s    	r\   c                    SU R                   R                  5        SU R                  R                  5        S3U R                  U R                  -  -   $ )Nzcannot fuse z with rd   )r  rV   r  r  rz   r`   s    rY   __str__WhyNoFuse.__str__  sK    djj1134F4::;N;N;P:QQSTKK$))#
 	
r\   )rz   r  r  r  Nr  rG   r  rG   r   r   )r  r   rz   r   r   r   r   )
ri   r   r   r   	__slots__r   r   r  r  r   r   r\   rY   r  r  v  s#     5IK


r\   r  c                    [        U [        [        45      (       a  [        U [        S9n [
        R                  " U SS9nSU;   a  S[        R                  " US5       3$ U$ )Nkey   )ro   r       )	r~   r   setsortedr   pprintrl   textwrapro   )objrq   s     rY   rl   rl     sU    #
C())Sc"^^C*Fv~HOOFG4566Mr\   c                  @    \ rS rSrSS jrS	S jrS
S jrSS jr\rSr	g)r   i  c                &    [        U/5      U l        g rU   rO  r  s     rY   r   OutputNode.__init__  s    ",cU"3r\   c                    grS  r   r`   s    rY   r  OutputNode.is_reduction  r  r\   c                    g)Nr   r   r`   s    rY   r|   'OutputNode.get_inputs_that_alias_output  r   r\   c                    g)NOUTPUTr   r`   s    rY   rV   OutputNode.get_name  s    r\   )r   N)r  r(   r   r   r   r   r   )
ri   r   r   r   r   r  r|   rV   r   r   r   r\   rY   r   r     s    4 Hr\   r   c                  ^ ^^^^ [         R                  " 5       mT R                   HU  n[        U[        5      (       a  M  TUR
                     R                  5       nTTU   R                  5       ==   S-  ss'   MW     SUUUU 4S jjm[        U4S jT R                   5       5      nU(       a?  T R                  U-
  T l        T R                  T R                  R                  U5      5        gg)aU  
Prunes weakdeps intended for mutation ordering
on an upstream fused node if after fusion there is another dependency
on the fused upstream node, making the weakdep redundant

In essence this enforces an ordering on fusions. As fusions occur, weakdeps will
be incrementally removed, enabling other fusions, ensuring they are fused in order.
r   c                   > [        U [        5      (       aI  TU R                     R                  5       nTTU   R	                  5          S:  nTU   T:H  nU=(       d    U$ g)Nr   F)r~   r)   r_   rZ   rV   )r  rU  is_redundantis_self_depr   name_to_dep_countrc  rM   s       rY   rV  +_prune_redundant_deps.<locals>.should_prune  se    c7##!#((+<<>G,-?-H-Q-Q-STWXXL -W5=K.;.r\   c              3  F   >#    U  H  nT" U5      (       d  M  Uv   M     g 7frU   r   rY  s     rY   r  (_prune_redundant_deps.<locals>.<genexpr>  s      .,s2C.r[  Nr\  )r2  r   r   r~   r)   r_   rZ   rV   r   r  r   r]  )rM   rc  r   r  rU  deps_to_pruner  rV  s   ```   @@rY   rb  rb    s     '2&9&9&;&&#w''!#((+<<>G09BBDEJE '

 
  .. M "&"9"9M"IT--::=IJ r\   )zextern_kernels.convolutionzextern_kernels.mmzextern_kernels.bmmzextern_kernels.addmmzextern_kernels._scaled_mmc                  J   ^  \ rS rSrSU 4S jjrSS jrS	S jrS	S jrSrU =r	$ )
r0  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rU   superr   r   r  get_read_writesrW   rL   rM   rQ  s      rY   r   "ExternKernelSchedulerNode.__init__  5    #T"T1134r\   c                V    U R                  5        S[        U R                  SS 5       3$ )Nz.node.kernel = r;  )rV   r   rM   r`   s    rY   r   )ExternKernelSchedulerNode.debug_str_extra  s*    --/"/'$))EY[_2`1abbr\   c                    gNTr   r`   s    rY   r  #ExternKernelSchedulerNode.is_extern  rh  r\   c                    U R                   c   e[        U R                   S5      =(       a    U R                   R                  5       $ )Nr  )rM   r   r  r`   s    rY   r  *ExternKernelSchedulerNode.has_side_effects  s6    yy$$$tyy"45V$)):T:T:VVr\   r   rL   rK   rM   r|  r   r   r   r   )
ri   r   r   r   r   r   r  r  r   __classcell__rQ  s   @rY   r0  r0    s    5
cW Wr\   r0  c                  ,   ^  \ rS rSrSU 4S jjrSrU =r$ )r  i  c                   > [         TU ]  U5        U R                  U5        U R                  UR	                  5       5        g rU   r  r  s      rY   r   NopKernelSchedulerNode.__init__  r  r\   r   r  )ri   r   r   r   r   r   r  r  s   @rY   r  r    s    5 5r\   r  c                    ^  \ rS rSr% S\S'   S\S'         SU 4S jjr  S     SS jjr  S     SS jjr      SS	 jrS S
 jr	S!S jr
      S"S jrS#S jrS$S jrS%S jrS%S jrS%S jrS&S jrS'S jr    S(S jrS)S jr S*   S+S jjr\S,S j5       r\S,S j5       rS-S jr\S.S j5       rSrU =r$ )/r~  i  z tuple[Sequence[sympy.Expr], ...]_sizesr1   _bodyc                f   > [         TU ]  U5        U R                  U5        U R                  5         g rU   )r  r   r   _compute_attrsr  s      rY   r   SchedulerNode.__init__  s,    
 	#T"r\   c                    [        U R                  [        R                  [        R                  45      (       d   eU R                  R                  UUS9u  U l        U l        U R                  R                  5       nU R                  R                  U5      R                  nX4" U R                  5      4U l        [        R                  (       + =(       d    [        UR                   5      (       + n[        U R                  [        R                  5      (       a)  U R#                  U R                  R%                  US95        g U R#                  [&        R$                  " U R                  /U R                  Q7SU065        g )Nextra_indexing_constraintsrecompute_sizes_body_func)	normalizer  )r~   rM   r   r,   TemplateBuffersimplify_and_reorderr  r  get_device_or_errorrL   get_backendgroup_fnr   r   loop_ordering_after_fusionr?   rh   r  extract_read_writesr   )rW   r  r  r  r  should_normalizes         rY   r  SchedulerNode._compute_attrs  s9   
 $))b&7&79J9J%KLLLL"&))"@"@'A&? #A #
TZ
 ..0>>--f5>>ht{{34
  &@@@ 
KKI
 E
 dii!2!233  		--8H-I   00JJ!%8Hr\   c                $    U R                  UUS9  g )Nr  )r  )rW   r  r  s      rY   recompute_size_and_body%SchedulerNode.recompute_size_and_body  s    
 	'A&? 	 	
r\   c                n   [        S U R                  R                   5       5      nU R                  [        R
                  " U R                  /U R                  Q7SU06R                  U5      5        U R                  R                  U 5        U(       a!  SSKJn  UR                  R                  5         g g )Nc              3  `   #    U  H$  n[        U[        [        45      (       d  M   Uv   M&     g 7frU   )r~   r)   r(   r8  s     rY   r  5SchedulerNode.refresh_dependencies.<locals>.<genexpr>-  s$      0
1CZgwEW5XCC1s   .	.r  r   SIMDScheduling)r   r   r   r  r   r  r  r  r  pointwise_read_writesclear_cachecodegen.simdr  candidate_tilingscache_clear)rW   r  need_clear_tiling_cache	fake_depsr  s        rY   refresh_dependencies"SchedulerNode.refresh_dependencies(  s    
 &0 0
++110
 &
	 	,,

![[4=i	"	
 	""..t4"4 ,,88: #r\   c                    U R                   R                  U5      U l         U R                   R                  U l        U R	                  SSS9  g )NFTr  r  )r  reorder_iter_loopssizesr  r  )rW   	new_orders     rY   apply_new_loop_order"SchedulerNode.apply_new_loop_orderD  sA    ZZ22

 jj&&!!E4!Pr\   c                    U R                   R                  5       U l         U R                   R                  U l        U R	                  SSS9  g )NTFr  )r  merge_loopsr  r  r  r`   s    rY   r  SchedulerNode.merge_loopsL  s<    ZZ++-
jj&& 	!!D%!Pr\   c                   S nU R                   S   n[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       aP  [        =R
                  S-  sl        [        R                  SU R                  5       U5        U R                  U5        g [        R                  SU R                  5       5        g )Nr   r   z"Reorder loops for %s with order %szEDon't reordering %s because we can not decide the suitable loop order)
r  rn   num_varsdecide_loop_order_to_matchr    num_loop_reorderingloop_ordering_logr  rV   r  )rW   r	  r
  r  
self_sizess        rY   r  'SchedulerNode.reorder_loops_by_dep_pairX  s     	[[^
z?h//E93E3EE ;;IFI''1,'##4dmmoy %%i0##Wr\   c                N   U R                  5       nU SU R                  S    3U SU R                  S    3U SU R                   3/nU R                  R	                  5        H  n[        U[        5      (       a  M  UR                  n[        R                  R                  U5      n[        U[        R                  5      (       a  Mf  UR                  U S[        UR                  5       35        M     [        U R                   ["        5      (       aS  UR                  SU S35        UR                  [$        R&                  " U R                   R)                  5       S	5      5        U R*                  c   eUR-                  U R/                  5       5        S
R1                  U5      $ )Nz.group.device = r   z.group.iteration = r   z	.sizes = z
_layout = zclass z_loop_body:r  r   )rV   r   r  r   r  r~   r)   r_   rD   r   r"  r   r'  r  rl   rj   r  r1   r  ro   rs   rM   rG  r   join)rW   r_   linesr  r  r   s         rY   r   SchedulerNode.debug_str_extral  sM   }}f$TZZ]O4f'

17fIdkk]+

 ##446Cc7++88gg((2!#r'9'9::LLH:Z

8K7L!MN 7 djj(++LL6${34LL)=)=)?HIyy$$$T//12yyr\   c                    U R                   $ rU   )r  r`   s    rY   r1  SchedulerNode.get_ranges      {{r\   c                    [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  R                  5       5      $ Nr:  )r~   rM   r   r,   r  rh   r   r   r`   s    rY   r  SchedulerNode.is_reduction  s^    $))b&7&79J9J%KLL 	
tDII !	
L DII00233r\   c                b   [        U R                  [        R                  [        R                  45      (       d   S[        U R                  5      < 35       e[        U R                  [        R                  5      =(       a.    [        U R                  R                  [        R                  5      $ r  )r~   rM   r   r,   r  rh   r   	SplitScanr`   s    rY   r  SchedulerNode.is_split_scan  s|    $))b&7&79J9J%KLL 	
tDII !	
L $))R%6%67 
JIINNBLL=
 	
r\   c                J    [        U R                  [        R                  5      $ rU   r~   rM   r   r  r`   s    rY   r  SchedulerNode.is_template  s    $))R%6%677r\   c                p    [        U R                  [        R                  5      (       a  U R                  $ S $ rU   r  r`   s    rY   rf  SchedulerNode.get_template_node  s'    &tyy"2C2CDDtyyN$Nr\   c                f    U R                  5         U R                  5         U R                  U5        g rU   )r  r3  r  )rW   
index_varss     rY   runSchedulerNode.run  s#    ""$Z r\   c                (   U R                   n[        [        [        U5      5      [        [        [        U5      5      :X  d   e[	        [        [        R                  R                  U5      [        R                  R                  U5      5      5      nU$ rU   )	r  r  maprn   dictzipr;  r<  from_iterable)rW   r  r  
var_rangess       rY   ranges_from_index_vars$SchedulerNode.ranges_from_index_vars  sp     3sE?#s3sJ+?'@@@@--j9--e4

 r\   c                   U R                  U5      n [        R                  " [        [        R                  " 5       U5      5         [        R
                  R                  U 5         U R                  " U6   S S S 5        S S S 5        g ! , (       d  f       N= f! , (       d  f       g = f! [         a"    [        R                  SU R                  5        e f = f)NzError in codegen for %s)r$  rD   set_ops_handlerr6   get_ops_handlerr   rN  r  r   r   fatalrM   )rW   r  r#  s      rY   r  SchedulerNode.codegen  s    00<
	!!"213D3D3F
"ST))$/

J' 0 UT// UT  	II/;	sA   3B)  B&B6B>B) 
B	B
B&"B) &B) ),Cc                    U(       a  U R                   O[        U R                   5      u  p#[        R                  " U R                  U[
        R                  R                  /[        U5      -  /S9$ )zL
Get the memory dependencies in either the pointwise or the reduction axes.
)hidden_args)	r  reversedr   r  r  sympySZerorn   )rW   	pointwise
keep_sizesignore_sizess       rY   "pointwise_or_reduction_read_writes0SchedulerNode.pointwise_or_reduction_read_writes  sR     3<4;;$++AV 
//JJ
%'',,#lBS1S0T
 	
r\   c                     U R                  SS9$ )z8
Get the memory dependencies in the non-reduction axes.
Tr1  r4  r`   s    rY   r  #SchedulerNode.pointwise_read_writes  s    
 666FFr\   c                     U R                  SS9$ )z4
Get the memory dependencies in the reduction axes.
Fr7  r8  r`   s    rY   reduction_read_writes#SchedulerNode.reduction_read_writes  s    
 666GGr\   c                (   U R                  5       (       a  g[        S U R                  5        5       5      (       a  g[        U R                  R
                  5      S:X  a  [        U[        R                  5      (       a  [        [        U R                  R
                  5      5      n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  UR                  :H  =(       a    UR                  UR                  :H  $ g)NFc              3  @   #    U  H  oR                  5       v   M     g 7frU   )rk   rv  s     rY   r  ,SchedulerNode.can_inplace.<locals>.<genexpr>  s     ?,>S  ,>rp  r   ztype(write_dep)=)r  r  r   rn   r   r   r~   r   r'   rs  iterrh   indexsize)rW   r  	write_deps      rY   r  SchedulerNode.can_inplace  s    ?D,<,<,>???t&&'1,l,,2
 2
 T$"2"2"9"9:;Ii)?)?@@WEUT)_DVBWW@>>Y__4X)..9XXr\   c                H   [         [           " 5       n[        U R                  [        5      (       a  U R                  R                  5        H  nUR                  S:X  d  M  UR                  S:X  d  M'  SUR                  ;   a  UR                  S   S:X  d0  [        UR                  5      S:X  d  Me  UR                  S   S:X  d  Mz  UR                  SUR                  ;   a  UR                  S   O)[        UR                  5      S:  a  UR                  S	   OS
5        M     U$ )Ncall_methodstoremode
atomic_add   r  r_      r   r   )r   r   r~   r  r1   rq  rX   r  r   rn   rz   rE  )rW   buffers_store_as_atomic_addrM   s      rY   _get_atomic_add_buffers%SchedulerNode._get_atomic_add_buffers  s    &0o&7#djj(++

,,.GG},w.4;;.4;;v3F,3V		Na/DIIaLL4P 033!T[[0 F+.1$))n.Adiilr / +*r\   )r  r  r   )rL   rK   rM   z+Union[ir.ComputedBuffer, ir.TemplateBuffer]r   r   NN)r  z*Optional[tuple[dict[Any, Any], list[Any]]]r  zOptional[Callable[..., Any]]r   r   )r  r   r  r   r   r   )r  zSequence[int]r   r   r   r}  r   )r   Sequence[Sequence[sympy.Expr]]r   r  )r  Sequence[sympy.Expr]r   r   )r  rP  r   zdict[sympy.Expr, sympy.Expr])r  rP  r   r   r  )r1  r   r   r   )r   r   r  r  )ri   r   r   r   r   r   r  r  r  r  r  r  r   r1  r  r  r  rf  r  r$  r  r4  r7   r  r;  r  rM  r   r  r  s   @rY   r~  r~    sk   ,,O : 
	 RVBF$N $@ 
	D RVBF
$N
 $@
 
	
;;8<;	;8Q
Q!.7	( ,4
8O!
8	%
 !%	
	
	 	
 G G H H + +r\   r~  c           	     z  ^  T R                   nT R                  [        R                  R	                  U Vs/ s H  o"R
                  PM     sn5      5        [        U 4S j[        R                  " U Vs/ s H  o"R                  PM     sn6  5       5      T R
                  R                  -
  T l        g s  snf s  snf )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frU   r_   rx  )r  r  group_snodes     rY   r  2refresh_group_node_dependencies.<locals>.<genexpr>  s/      
Pxx{;;== CP   "2	2)
r  r  r   
ReadWrites
merge_listr   r   unionr   r   )rU  r  r  s   `  rY   refresh_group_node_dependenciesr[    s     F**6+J6aMM6+JK
 	 
!'')O1*>*>)OP
 	

 
!
!
(
(	) " ,K *Ps   B34B8rK   c                   [        U [        [        45      (       d   eX l        Xl        S U l        [        R                  " U Vs/ s H  o3R                  c  M  UR                  PM     sn6 U l        [        U 5        [        S U R                   5       5      U l        [        S U R                   5       5      U l        U R                  5        Vs0 s H  oDR                  5       U_M     snU l        g s  snf s  snf )Nc              3  8   #    U  H  oR                   v   M     g 7frU   r   r  r  s     rY   r  "init_group_node.<locals>.<genexpr>       H5G5Gr  c              3  8   #    U  H  oR                   v   M     g 7frU   )r   r_  s     rY   r  r`    ra  r  )r~   r3  GroupedSchedulerNoder  rL   rM   r   rZ  r   r[  r)  r   rT  r   r   rV   r   )rU  rL   r  r  r   s        rY   init_group_noderd    s    
 k$68L#MNNNN%K&,,%	Av!+!++v	AK $K0H[5G5GHHKH[5G5GHHK'2'>'>'@#'@'@#K 
B#s   C4C4C9c                    ^  \ rS rSr% SrS\S'   \      SS j5       r      SS jrS U 4S jjr	\
S!S j5       rS!S	 jr\
S"S
 j5       rS#S jrS!S jrS!S jr      S$U 4S jjr\
S"S j5       r\
S"S j5       rS%S jrS!S jr\
S&S j5       r\
S&S j5       r\
S&S j5       r\
S'S j5       rS(S jr\
S&S j5       rS)S jrS*S jrS+S jrS!S jrSr U =r!$ ),r3  i  z
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be fused together. The way it does this is by maintaining
its unmet dependencies as the union of its constituent nodes.
r  r  c           	        UR                   UR                   L d   e[        U[        [        45      (       d   eUR	                  5       (       Ga  [        U[
        5      (       Ga  [        UR                  [        5      (       d   e[        UR                  R                  5      S:X  d   e[        [        [        UR                  R                  5      5      [        5      (       d   e[        [        UR                  R                  5      5      R                  nUR                  5        Vs/ s H  oDR	                  5       (       d  M  UPM     nn[        U5      S:X  d   eUS   n[        UR                  R                  5      S:X  d   e[        [        UR                  R                  5      5      n[        U[         5      (       d   e[#        [!        X7R$                  UR&                  UR(                  UR*                  5      /5      UR                  l
        O[        U[        [        45      (       d   e[-        [.        R0                  " UR                  5       UR                  5       5      5      nU " UR                   U5      $ s  snf )Nr   r   )rL   r~   r~  r3  r  r0  rM   r/   rn   r   r   rs  r@  r(   r_   rq  r'   r   rA  	var_namesrB  rH  r   r;  r<  )	r]  r  r  r_   rM   template_nodesrx  writeru  s	            rY   fuseFusedSchedulerNode.fuse#  s    %//111%-1C!DEEEE:e5N#O#O ejj+6666u((//0A555d4(9(9(@(@#ABGLLLLU..5567<<D/4/@W/@tDTDTDVd/@NW~&!+++*1-M}00778A===m77>>?@EeY////'1kk5??EJJ

(E$ em5G%HIIIIY__U__%68IJK5??E**! Xs   ,JJc                   U R                  5       (       a  g S nU R                   Hh  n[        U[        5      (       d   eUb<  [	        U5      [	        UR
                  S   5      :w  a  [        R                  S5          g UR
                  S   nMj     S nUc   e[        U5      UR                  s=:X  a  UR                  :X  a  O  OUR                  U5      nU(       d%  [        R                  SU R                  5       5        g [        =R                  S-  sl        [        R                  SU R                  5       U5        U R                   H+  n[        U[        5      (       d   eUR                  U5        M-     [        U 5        g )Nr   z1Can not reorder fused node due to different sizeszODont reordering fused node %s because we can not decide the suitable loop orderr   z-Reorder loops for fused node %s with order %s)r  r  r~   r~  tupler  r  r  rn   r  r  rV   r    r  r  r[  )rW   r	  r
  r  snoder  s         rY   r  ,FusedSchedulerNode.reorder_loops_by_dep_pairE  sH    
[[Ee]3333%%
*;uU\\RS_?U*U!''G aJ ! 	%%%z?h//E93E3EE ;;IFI##a ##q(#;T]]_i	
 [[Ee]3333&&y1 ! 	(-r\   c                ~   > [         TU ]  U5        [        XU5        / U l        [	        US S9R
                  U l        g )Nc                4    [        U R                  5       5      $ rU   )r   r  r  s    rY   r   -FusedSchedulerNode.__init__.<locals>.<lambda>n  s    s1>>3C/Dr\   r  )r  r   rd  rR   rT  r   rW   rL   r  rQ  s      rY   r   FusedSchedulerNode.__init__j  s6    #0%'
%DEKK
r\   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf N_r	  r  rV   rW   r  s     rY   rV   FusedSchedulerNode.get_namep  +    xxt{{;{!{;<<;   :c                <    U R                   S   R                  5       $ Nr   r  rV   r`   s    rY   rk  !FusedSchedulerNode.get_first_namet      {{1~&&((r\   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rU   r   rZ  r  rx  rz  s     rY   rx  #FusedSchedulerNode.get_buffer_namesw  0    !L1"4"4"6!LMM!L   <c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rU   r  rG  r   rW   rq   rM   s      rY   r   FusedSchedulerNode.get_outputs{  /    (*KKDMM$**,-  r\   c           
        [        U R                  5       VVs/ s H+  u  pU R                  5        SU SUR                  5        3PM-     nnnU R                  S   R                  nUb  UR                  U R                  5       5        [        R                  " SR                  U5      R                  5       S5      $ s  snnf )Nz.snodes[z] =
r   r   r  )rt  r  rV   rs   rM   rG  r   r  ro   r	  r   )rW   rq  rM   r
  s       rY   r   "FusedSchedulerNode.debug_str_extra  s     %T[[1
1 }}xs%0@/AB1 	 
 {{1~""LL3356tyy/668&AA
s   2B=c                l    U R                    Vs/ s H  oR                  5       PM     nnU  SU 3$ s  snf )Nz
, snodes: )r  r  )rW   rM   
snodes_strs      rY   r  "FusedSchedulerNode.debug_str_short  s8    9=E**,
Ez*.. Fs   1c                   > [         TU ]  X5        [        [           " 5       n[	        U R
                  5       H/  nUR                  X5        UR                  UR                  5        M1     g rU   )r  r0  r   r   r-  r  updater   )rW   r.  r*  rM   rQ  s       rY   r0  !FusedSchedulerNode.set_last_usage  sV    
 	2G )o/T[[)D 3H&&t7 *r\   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rU   )r   rZ  r  r=  rz  s     rY   r=  $FusedSchedulerNode.used_buffer_names  s0    !MA"5"5"7!MNN!Mr  c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rU   )r   rZ  r  r-  rz  s     rY   r-  /FusedSchedulerNode.used_or_aliased_buffer_names  s5    8<D1,,.D
 	
Dr  c                    U R                   $ rU   r  r`   s    rY   rq  FusedSchedulerNode.get_nodes  r  r\   c                T    [        U 5      R                   SU R                  5        S3$ )Nz(nodes=r   r   r`   s    rY   r   FusedSchedulerNode.__repr__  s'    t*%%&gdmmo->a@@r\   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   )r  r_  s     rY   r  2FusedSchedulerNode.is_reduction.<locals>.<genexpr>  s     9[>>##[rp  r  r  r`   s    rY   r  FusedSchedulerNode.is_reduction  s    9T[[999r\   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   )r  r_  s     rY   r  3FusedSchedulerNode.is_split_scan.<locals>.<genexpr>  s     :k??$$krp  r  r`   s    rY   r   FusedSchedulerNode.is_split_scan  s    :dkk:::r\   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   rp  r_  s     rY   r  1FusedSchedulerNode.is_template.<locals>.<genexpr>  s     8Kq==??Krp  r  r`   s    rY   r  FusedSchedulerNode.is_template  s    8DKK888r\   c                x    U R                    H*  nUR                  5       (       d  M  UR                  5       s  $    g rU   )r  r  rf  rW   rM   s     rY   rf  $FusedSchedulerNode.get_template_node  s3    KKD!!--//   r\   c                     U R                   S   $ r  )r   r`   s    rY   r  FusedSchedulerNode.get_device  s    zz!}r\   c                :    [        S U R                   5       5      $ )Nc              3  @   #    U  H  oR                  5       v   M     g 7frU   )r   r_  s     rY   r  >FusedSchedulerNode.has_aliasing_or_mutation.<locals>.<genexpr>  s     EA--//rp  r  r`   s    rY   r   +FusedSchedulerNode.has_aliasing_or_mutation  s    EEEEr\   c                    [         erU   NotImplementedErrorr  s     rY   r  'FusedSchedulerNode.update_mutated_names      !!r\   c                    [         erU   r  )rW   r_   s     rY   r  FusedSchedulerNode.add_fake_dep  r  r\   c                    [         erU   r  r  s     rY   r  FusedSchedulerNode.can_inplace  r  r\   c                X   U R                  5       nSR                  S U R                   5       5      n[        5       nUR	                  U S[        U 5      R                   SU SU S[        U R                  R                  5       SU S[        U R                  5       SU S	[        U R                  R                  U R                  -
  5       SU S
35        UR                  5          U R                  5        H"  nUR	                  UR                  5       5        M$     SSS5        UR                  S5         UR	                  U R!                  5       5        UR)                  5       R+                  5       $ ! , (       d  f       N]= f! ["         a    [$        R'                  SSS9   NOf = f)r   re   c              3  L   #    U  H  n[        U5      R                  v   M     g 7frU   )rh   ri   r  s     rY   r  /FusedSchedulerNode.debug_str.<locals>.<genexpr>  s     F+QQ 0 0+s   "$rd   r   r   r   r   r   r   z.outputs = [
            Nrf   r   Tr   )rV   r	  r  r=   r   rh   ri   rl   r   r   r   r   ro   r   rs   rg   r   r   r   r   rp   r   )rW   r_   node_typestrr   r   s        rY   rs   FusedSchedulerNode.debug_str  sx   }}xxF$++FF

bd		Q|n -j))0012 3WT%<%<=> ?74#3#3#9#9D<S<S#STU V 	
 ZZ\'')

3==?+ *  	c	HJJt++-.  '')) \  	HKK7$KG	Hs   )7E7:F 7
FF)(F))r   rR   r  rG   r  rG   r   r3  r}  rL   rK   r  r  r   r   r   r  r   zlist[SchedulerBuffer]r  r  r   r  )r   torch.devicer~  )r_   r&   r   r   r  )"ri   r   r   r   __doc__r   classmethodrj  r  r   r7   rV   rk  rx  r   r   r  r0  r=  r-  rq  r   r  r  r  rf  r  r   r  r  r  rs   r   r  r  s   @rY   r3  r3    st    $#+%+.?+	+ +B#.!#..7#.	#.JL = =) N N	B/8#28HV8	8 O O 
 

A : : ; ; 9 9   F F
"""* *r\   r3  c                  z  ^  \ rS rSr% Sr    SS jr    SS jr\SS j5       r\      SS j5       r	   S             SU 4S jjjr
\    SS j5       r\    SS	 j5       r\rS
\S'   \    SS j5       r\    SS j5       rSS jrSS jrS S jrS!S jrS"S jrS#S jr    S$S jrSrU =r$ )%ForeachKernelSchedulerNodei  z
This is a schedular node that consists of a set of scheduler nodes that
has no data dependencies among them and can be executed in parallel.
c                    UR                  5        H@  nUR                  5       U R                  ;   d  M#  U R                  UR                  5          s  $    g rU   )r   rV   read_to_node)rW   producerr   s      rY   get_consumer_subnode_for3ForeachKernelSchedulerNode.get_consumer_subnode_for  sG     '')C||~!2!22((88 * r\   c                   [         [           " 5       nUR                  R                   H  nUR                  U R
                  R                  ;  a  M)  U R
                  R                  UR                     R                  5       nX@R                  ;   d  Mk  UR                  U R                  U   5        M     [        U5      S:X  a  [        [        U5      5      $ g Nr   )r   rG   r   r   r_   rL   r   rZ   name_to_noderE  rn   rs  r@  )rW   consumer	producersrd	node_names        rY   get_producer_subnode_for3ForeachKernelSchedulerNode.get_producer_subnode_for  s     013	&&,,Bwwdnn88822277;LLNI---d//	:; - y>QY((r\   c                  ^ [        TU5      nTR                  5       (       a  UR                  5       (       a  [        R                  " [        T5      m[        R                  " [        U5      n[        TR                  5      [        UR                  5      :H  nU(       d  U" S5        U=(       a3    [        U4S j[        TR                  UR                  5       5       5      $ UR                  5       (       ar  TR                  5       (       a	  U" S5        g[        R                  " [        U5      nUR                  T5      nUb  UR                  R                  TU5      $ U" S5        gTR                  5       (       aq  UR                  5       (       a	  U" S5        g[        R                  " [        T5      mTR                  U5      nUb  TR                  R                  Xb5      $ U" S5        g[        S5      e)	Nzforeach do not have same lengthc              3  ^   >#    U  H"  u  pTR                   R                  X5      v   M$     g 7frU   )rL   can_fuse)r  lrr  s      rY   r  6ForeachKernelSchedulerNode.can_fuse.<locals>.<genexpr>  s.      )ADA ""++A11A   *-zXcandidate producer is a reduction, foreach ops cannot be fused with reductions currentlyFz5candidate producer is not dep of any foreach consumerzXcandidate consumer is a reduction, foreach ops cannot be fused with reductions currentlyz5candidate consumer has no dep in any foreach producerzXAt least one node passed to ForeachKernelSchedulerNode.can_fuse should be a foreach node)r  r  typingcastr  rn   r  r  r!  r  r  rL   r  r  AssertionError)r]  r  r  whyforeach_matchconsumer_subnodeproducer_subnodes    `     rY   r  #ForeachKernelSchedulerNode.can_fuse  s   (+  X%8%8%:%:{{#=xHH{{#=xHH0C4HHM 56  S )A) &    ""$$&&n {{#=xHH'@@J+))228=MNNGH  ""$$&&n {{#=xHH'@@J+))223CNNGHf
 	
r\   c           	     `   UR                  5       (       d  UR                  5       (       d   eUR                  5       (       a4  [        R                  " [        U5      nUR                  nUR
                  nO3[        R                  " [        U5      nUR                  nUR
                  nS nS nUR                  5       (       a  UR                  5       (       a  [        R                  " [        U5      n[        R                  " [        U5      n[        UR                  UR                  5       VVs/ s H  u  px[        R                  Xx5      PM     n	nnGO?UR                  5       (       a  [        R                  " [        U5      nUR                  U5      n
/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     OUR                  5       (       a  [        R                  " [        U5      nUR                  U5      n/ n	UnS nUR                   HB  nXL a*  [        R                  X5      nUnU	R                  U5        M1  U	R                  U5        MD     O[        S5      eU " UR                  U	UUUUS9$ s  snnf )NzTAt least one node passed to ForeachKernelSchedulerNode.fuse should be a foreach node)use_custom_partition_algoprev_node_1prev_node_2enable_autotune)r  r  r  r  r  r  r!  r  r3  rj  r  r  r  r  rL   )r]  r  r  r  r  r  r  r  r  fused_nodesr  rM   new_noder  s                 rY   rj  ForeachKernelSchedulerNode.fuse>  s\    ""$$(;(;(=(===  {{#=xHH(0(J(J%&66O{{#=xHH(0(J(J%&66O  X%8%8%:%:{{#=xHH{{#=xHH  AADA #''-A  K   ""{{#=xHH'@@JK"KK +166tFH"*K&&x0&&t, (   ""{{#=xHH'@@JK"KK +166xFH"*K&&x0&&t, ( !f  &?##+
 	
Ks   0!J*c                z  >^  0 T l         0 T l        Ub  Ucv  [        TT ]  X5        U H_  nUR                  R
                   H  nUT R                   UR                  '   M     UR                  5        H  n	UT R                  U	'   M     Ma     GOUT l        UT l	        S T l
        / T l        T R                  [        R                  R                  UR                  UR                  /5      5        [!        U 4S j[         R"                  " UR$                  UR$                  5       5       5      T R                  R&                  -
  T l        [)        UR*                  UR*                  /5      T l        [-        UR.                  UR.                  /5      T l        UR1                  5       (       a  [3        U[4        5      (       d   eXEpO[3        U[4        5      (       d   eXTpU
R6                  T l        T R6                  R9                  UR6                  5        U
R                  T l        UR                  5        H  n	UT R                  U	'   M     UT l        US   R=                  5       nU(       d   eU[>        R@                  " S5      444T l!        [         [D        RF                  RH                     " 5       T l%        UT l&        g )Nc              3  h   >#    U  H'  nUR                   TR                  5       ;  d  M#  Uv   M)     g 7frU   rT  rM  s     rY   r  6ForeachKernelSchedulerNode.__init__.<locals>.<genexpr>  s5        xxt'<'<'>>	 C rW  r   combo_kernel)'r  r  r  r   r   r   r_   rr  rL   r  rM   rR   r  r   rX  rY  r   rZ  r   r   r)  r   rT  r   r  r~   r  r   r  r  r  r.  Exprr   r   fxNoder  r  )rW   rL   r  r  r  r  r  rM   r  r_   foreach_node
other_noder  rQ  s   `            rY   r   #ForeachKernelSchedulerNode.__init__  s\    +"5GY/ ,,22D37D%%dii0 3 !446D.2D%%d+ 7	  'DN DKDI)+DJ  ''22 ,,k.E.EF  )//#668V8V   ""))* # !+"7"79N9N!OPDN +"7"79N9N!OPDN%%''!+/IJJJJ+6j!+/IJJJJ+6j)33DNNN!!*"6"67 , 9 9D"668*4!!$' 9 *C&%%'v

> :<>?
!%((--02.r\   c           	        U Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       aW  [        R                  S[	        U5      U Vs/ s H+  oDR
                  c  M  UR
                  R                  5       PM-     sn5        U Vs/ s H"  n[        U[        [        45      (       a  M   UPM$     nnU Vs/ s H  n[        U[        5      (       d  M  UPM     nnU(       a  [        R                  S[	        U5      5        U Vs/ s H  n[        U[        5      (       a  M  UPM     nnU Vs/ s H  o"R                  5       (       d  M  UPM     nnU(       a)  [        R                  S[        [	        U5      /5      5        U Vs/ s H  o"U;  d  M
  UPM     nnU$ s  snf s  snf s  snf s  snf s  snf s  snf s  snf )Nz/ComboKernels: %d external nodes are filtered %sz+ComboKernels: %d foreach nodes are filteredz,ComboKernels: %d template nodes are filtered)r~   r0  r   r  rn   rM   r  r  r  r  r   )r]  ru  r  externrM   filtered_nodesforeach_nodesrh  s           rY   combinable_nodes+ForeachKernelSchedulerNode.combinable_nodes  s    #OUj4M&N!UOIIAF5;UVTyy(&&(VU 
a"8:S!TU  	 
 &
%!A7Q)RA~ 	 
 IICSEWX%
%!Z;U-VA~ 	 
 &4G^}}!^GII>C/01 &4O^7N!^O7 P
 V




 H PsR   F"F"F'#F'F,/F,;F1F1
F6'F63F;F;	G G c           
         U R                  5       n/ nSnU H=  nUR                  [        S[        U5      U5       Vs/ s H	  nXEXS-    PM     sn5        M?     U$ s  snf )zC
Returns a list of lists of nodes that are to be grouped together.
   r   )_topological_sort_nodesrG  rangern   )rL   sorted_nodesgrouped_nodesmax_num_nodesru  rq  s         rY   &_default_group_nodes_for_combo_kernelsAForeachKernelSchedulerNode._default_group_nodes_for_combo_kernels  ss     !88:!E   #1c%j-@@ a/0@ " s   A
4Callable[[Scheduler], list[list[BaseSchedulerNode]]]!group_algorithm_for_combo_kernelsc                    U [         l        g rU   r  r  )custom_group_algorithms    rY   %set_group_algorithm_for_combo_kernels@ForeachKernelSchedulerNode.set_group_algorithm_for_combo_kernels  s    
 # 	#Dr\   c                ,    [         R                  U 5      $ rU   r
  rL   s    rY   group_nodes_for_combo_kernels8ForeachKernelSchedulerNode.group_nodes_for_combo_kernels  s     *KKIVVr\   c                    [         erU   r  r`   s    rY   r3  #ForeachKernelSchedulerNode.mark_run
  r  r\   c                    [         erU   r  r`   s    rY   r  "ForeachKernelSchedulerNode.codegen  r  r\   c                    gr  r   r`   s    rY   r  %ForeachKernelSchedulerNode.is_foreach  rh  r\   c                ,    [        U R                  5      $ )z]Returns a list of nodes which comprise the combo kernel.
These nodes may be vertically fused.)r   r  r`   s    rY   get_subkernel_nodes.ForeachKernelSchedulerNode.get_subkernel_nodes  s     DKK  r\   c                t    [        [        R                  R                  S U R                   5       5      5      $ )ziReturns all nodes contained in this kernel, unpacking fused nodes
into their constituent scheduler nodes.c              3  @   #    U  H  oR                  5       v   M     g 7frU   )rq  r_  s     rY   r  7ForeachKernelSchedulerNode.get_nodes.<locals>.<genexpr>  s     1UA++--rp  )r   r;  r<  r"  r  r`   s    rY   rq  $ForeachKernelSchedulerNode.get_nodes  s(     IOO111U1UUVVr\   c                <    U R                   S   R                  5       $ r  )r  rk  r`   s    rY   rk  )ForeachKernelSchedulerNode.get_first_name  s    {{1~,,..r\   c                    [        XU R                  R                  5        U R                   H  nUR	                  U5        M     g rU   )rb  rL   r   r  rd  )rW   rc  rM   s      rY   rd  /ForeachKernelSchedulerNode.prune_redundant_deps   s5     	d8R8RSKKD%%&89  r\   )r   r  r   r   r   r  rM   r  r  rL   r  r   r  rR   )r  rG   r   rN   )r  rG   r   rN   r  rG   r  rG   r   r   )r  rG   r  rG   r   r  )NNF)rL   rK   r  r  r  r   r  rN   r  rN   r  r   r   r   ru  r  r   r  )rL   rK   r   list[list[BaseSchedulerNode]])r  r  r   r   r   r   r   r  r  r   r  )ri   r   r   r   r  r  r  r  r  rj  r   r  r  r  r  r   r  r  r3  r  r  r  rq  rk  rd  r   r  r  s   @rY   r  r    s   
)	$)	$& ,
 ,
\ >
(>
4E>
	#>
 >
J 4837 %B/B/ (B/ $(	B/
 1B/ 1B/ B/ 
B/ B/H +	  @ 	& * 	/ & ( / 
 T
	
 
 WW	&W W
""!
W
/:">:	: :r\   r  c                     ^  \ rS rSr% SrS\S'   \SS j5       rSU 4S jjrSS jr	SS jr
\SS	 j5       rSS
 jr\SS j5       rSS jrSS jr\SS j5       rSrU =r$ )rc  i)  a'  
This is a "fake" scheduler node that represents a group of scheduler nodes
that are meant to be *grouped* together (it does not allow another node to be scheduled
in between its constituent nodes, nor does it allow another node to fuse into any of its constituent nodes).
The way it does this is by maintaining its unmet dependencies as the union of its constituent nodes.
Fusion will still happen among the nodes within each GroupedSchedulerNode.
At codegen time, this scheduler node will be unpacked and codegen is called on each constituent node.
r  r  c                   ^ US   R                   m[        U4S jU 5       5      (       d   eU " TU5      nU H   nUTR                  UR                  5       '   M"     UTR                  UR                  5       '   U$ )Nr   c              3  >   >#    U  H  oR                   TL v   M     g 7frU   r  )r  rM   rL   s     rY   r  .GroupedSchedulerNode.create.<locals>.<genexpr>8  s     B64>>Y.6s   )rL   r  rc  rV   )r]  r  grouped_snodern  rL   s       @rY   createGroupedSchedulerNode.create5  su    1I''	B6BBBBBIv.E=JI(()9: AN	$$]%;%;%=>r\   c                <   > [         TU ]  U5        [        XU5        g rU   )r  r   rd  rt  s      rY   r   GroupedSchedulerNode.__init__?  s    #0r\   c                   U R                    H)  nXR                  R                  UR                  5       '   M+     U R                  R                  U R                  5       	 U R                  R	                  U R                   5      $ )zw
Do fusion among nodes within this GroupedSchedulerNode,
and then unpack this GroupedSchedulerNode into regular nodes.
)r  rL   rc  rV   
fuse_nodes)rW   rn  s     rY   unpackGroupedSchedulerNode.unpackC  s\    
 [[EBGNN--enn.>? !NN--dmmo>~~((55r\   c                    U R                  U R                  R                  U5      5        U R                  R	                  U5        g rU   )r  r   r  r   rE  )rW   fake_deps     rY   r  !GroupedSchedulerNode.add_fake_depM  s5    T--77AB##H-r\   c                ~    SR                  U R                   Vs/ s H  oR                  5       PM     sn5      $ s  snf rw  ry  rz  s     rY   rV   GroupedSchedulerNode.get_nameQ  r|  r}  c                <    U R                   S   R                  5       $ r  r  r`   s    rY   rk  #GroupedSchedulerNode.get_first_nameU  r  r\   c                    [         R                  " U R                   Vs/ s H  oR                  5       PM     sn6 $ s  snf rU   r  rz  s     rY   rx  %GroupedSchedulerNode.get_buffer_namesX  r  r  c                n    / nU R                    H"  nUR                  UR                  5       5        M$     U$ rU   r  r  s      rY   r    GroupedSchedulerNode.get_outputs\  r  r\   c                    U R                   $ rU   r  r`   s    rY   rq  GroupedSchedulerNode.get_nodesb  r  r\   c                    grS  r   )r]  r  r  s      rY   r  GroupedSchedulerNode.can_fusee  s     r\   r   )r  r  r   rc  r  r&  )r5  r&   r   r   r   r  r  r  r#  )ri   r   r   r   r  r   r  r,  r   r2  r  r7   rV   rk  rx  r   rq  r  r   r  r  s   @rY   rc  rc  )  s~     $# 16. = =) N N  r\   rc  c           
     0  ^ ^ [         R                  SUU 4S jj5       n[        [        [	        [        T S   5      5      5      5      n[        U5      S:  a  U Vs/ s H  nT U   PM
     snm [        R                  (       a  UR                  US9  U$ s  snf )zu
A heuristic to decide loop iteration orders.  This has not been well
tuned and may be something we should autotune.
c                z  > TU    S:X  d	  TU   S:X  a  [        TU    S:H  TU   S:H  5      $ T Vs/ s H  n[        X    5      PM     nnT Vs/ s H  n[        X!   5      PM     nn[        S [        X45       5       5      n[        S [        X45       5       5      nXV:  a  gXe:  a  g[        X5      $ s  snf s  snf )Nr   c              3  F   #    U  H  u  pUS :H  =(       d    X:  v   M     g7fr?  r   r  sl_asl_bs      rY   r  5pick_loop_order.<locals>.index_cmp.<locals>.<genexpr>  $      
7VDAI$$7V   !c              3  F   #    U  H  u  pUS :H  =(       d    X!:  v   M     g7fr?  r   rF  s      rY   r  rI    rJ  rK  r  )r8   absr  r!  )	abslstride_len_astride_len_ba_firstb_firstr  stride_lengthss	          rY   	index_cmp"pick_loop_order.<locals>.index_cmpu  s    8q=E!HMuQx1}eAh!m44 .<<^rBE
^<-;<^rBE
^<  
7:<7V
 
  
7:<7V
 
  1y# =<s   B3B8r   r  )rN  r   rO  r   r   r   )		functools
cmp_to_keyr   r-  r  rn   r   pick_loop_orderssort)rU  r  priority_idxrV  orderpis   ``    rY   pick_loop_orderr_  k  s      4 %N1$5 6789E
<17CD|.,|D

y
!L Es   Bc                  d    \ rS rSr% S\S'   SrS\S'   SrS\S'   SS jrSS	 jrSS
 jr	SS jr
Srg)NodeUseri  $Union[BaseSchedulerNode, OutputNode]rM   Fr   r  is_weakc                v    [        U R                  R                  5       U R                  U R                  45      $ rU   )r^   rM   rV   r  rc  r`   s    rY   ra   NodeUser.__hash__  s+    TYY'')4+;+;T\\JKKr\   c                    [        U[        5      =(       aa    U R                  5       UR                  5       :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ rU   )r~   ra  rV   r  rc  rW   others     rY   __eq__NodeUser.__eq__  s[    uh' .5>>#33.  E$5$55. -		
r\   c                6    U R                   R                  5       $ rU   rv   r`   s    rY   rV   NodeUser.get_name  rx   r\   c                    U R                   UR                   L d   e[        U R                   U R                  =(       a    UR                  U R                  =(       a    UR                  5      $ rU   )rM   ra  r  rc  rg  s     rY   r   NodeUser.merge  sP    yyEJJ&&&II2!2!2LL*U]]
 	
r\   r   Nr   )rh  objectr   r   r   )rh  ra  r   ra  )ri   r   r   r   r   r  rc  ra   ri  rV   r   r   r   r\   rY   ra  ra    s3    
..K GTL
$
r\   ra  c                    ^  \ rS rSr% S\S'   SES jrSEU 4S jjrSFS jr\SGS j5       r	\	R                  SHS j5       r	SIS	 jrSJS
 jrSKS jrSIS jrSIS jrSIS jr    SLS jrSMS jrSNS jrSIS jrSIS jrSLS jrSIS jr    SOS jr      SPS jr      SQS jrSIS jrSRS jr      SSS jrSTS jr    SLS jrSUSVS jjr SWS jr!    SXS  jr"      SYS! jr#      SYS" jr$      SYS# jr%        SZS$ jr&      S[S% jr'S\S& jr(        S]S' jr)SYS( jr*      SYS) jr+        S^S* jr,S_S+ jr-S`S, jr.      S[S- jr/    SaS. jr0    SbS/ jr1SIS0 jr2SIS1 jr3SIS2 jr4ScS3 jr5SdS4 jr6SeS5 jr7SfS6 jr8      SgS7 jr9S\S8 jr:  ShS9 jr;      SiS: jr<  SjS; jr=SIS< jr>      SkS= jr?SIS> jr@SWS? jrA    SlS@ jrBSmSA jrCSnSB jrDSISC jrESDrFU =rG$ )orK   i  zdict[Dep, int]_Scheduler__dep_size_hint_cachec                p    [        S5         U R                  U5        S S S 5        g ! , (       d  f       g = f)NzScheduler.__init__)r   _initrW   ru  s     rY   r   Scheduler.__init__  s#    ./JJu 0//s   '
5c           
       >^  [         TT ]  5         0 T l        T [        R                  l        0 T l        [        [        5      T l	        [        R                  " 5       T l        [        [           " 5       T l        [        / [        R                  R                   R#                  5       Q[        R                  R$                  R#                  5       Q[        R                  R&                  R#                  5       Q5      T l        U Vs/ s H  nT R+                  U5      PM     snT l        T R/                  5         T R(                  R1                  [        R                  R$                  R#                  5       5        T R,                   H  nUR3                  5         M     T R5                  5       T l        T R,                   Vs0 s H  o"R9                  5       U_M     snT l        T R,                   VVs0 s H*  o3R=                  5         H  oDR9                  5       U_M     M,     snnT l        T R:                  RA                  5       T l!        0 T l"        0 T l#        [H        RJ                  " T R,                  T R>                  T RB                  5      T l        T RM                  5         T RO                  T R,                  5      T l        T RQ                  5         T R,                   Vs0 s H  o"R9                  5       U_M     snT l!        T RS                  5         [T        =RV                  [Y        T R,                  5      -  sl+        SSK-J.nJ/n  U" T R,                  5        [Y        T R,                  5      T l0        T Rc                  5         T RO                  T R,                  5      T l        [        [d        [        [        4      " 5       T l3        [h        Rj                  b%  [h        Rj                  " T R,                  5      T l        T Rm                  T R,                  5      T l        T Ro                  5         T Rq                  5         [h        Rr                  (       a  T Ru                  S S9  [h        Rv                  (       a  SSK<J;n  U" T R,                  T R>                  T RB                  [        [        R                  R                   R#                  5       5      [        [        R                  R{                  5       5      5      T l        [h        R|                  (       a%  [H        R~                  " T R,                  5      T l        T R                  5         T R                  5         U" T R,                  5        [        R                  R                  T R,                  5        T R                  5         [        [           " 5       T lE        0 T lF        [        S5      R                  U 4S j5        g s  snf s  snf s  snnf s  snf )Nr   )log_ir_post_fusionlog_ir_pre_fusion)num_ck_nodesr   )reorder_for_peak_memorygraph_statsc                 ^   > T R                   T R                  [        T R                  5      S.$ )N)graph_idnum_nodes_before_fusionnum_nodes_after_fusion)post_grad_graph_idnum_orig_nodesrn   ru  r`   s   rY   r   !Scheduler._init.<locals>.<lambda>1  s%     33+/+>+>*-djj/r\   )Ir  r   rq  rD   r   rL   backendsrs  _post_grad_graph_counterr  r;  count_graph_partition_counterr   r   r  r4  keys	constantstorchbind_constantsrL  create_scheduler_noderu  update_zero_dim_cpu_tensorr  r#  get_donated_buffersr   rV   r  r   r   copyrc  r*  mutation_renamesr   decide_global_ordering_of_commscompute_dependenciestopological_sort_scheduledead_node_eliminationcompute_ancestorsr    ir_nodes_pre_fusionrn   torch._inductor.debugrw  rx  r  create_foreach_nodesrm  logged_slow_fusionr   _pre_fusion_custom_passr1  r  finalize_multi_template_bufferscombo_kernelscreate_combo_kernel_nodesrz  memoryget_output_names reorder_for_compute_comm_overlap$reorder_compute_and_comm_for_overlapprocess_grouped_nodescompute_last_usager  graph_diagramdebug_draw_graphbuffer_names_to_freeorigin_to_indexr   add_row)	rW   ru  r  rM   r   rw  rx  rz  rQ  s	   `       rY   rs  Scheduler._init  sN   %'" <>"&'?"@(1(9%$.sO$5!&0%%**,""'') ,,113'
# >CCUd003UC
'')##**177+<+<+A+A+CDJJDOO  $$& 	# &*ZZ;
%/JJL!OZ;
 -1JJ8
,6DBRBRBT3LLNCBTNJ8
 AE@Q@Q@V@V@X 35 13 ::JJ##

 	!!#33DJJ?
""$<@JJ"GJq::<?J"G ##s4::6#O$**%!$**o!!#33DJJ?
",U38_"="?))577

CDJ__TZZ0
,,.***= ))70

  ''177//44671773356DJ 22CCDJJODJ""$!4::&	djj) %/sO$5! :<'//	
y D;
8
B #Hs   W ?W%01W* W0c                   0 n[         R                  R                   Hg  n[        [         R                  R                  U   [        R
                  5      (       d  M?  [        U [         R                  R                  U   S S9X'   Mi     U$ )N)rO   )rD   r   graph_inputs_originalr~   r   DonatedBufferr   )rW   name_to_donated_bufr_   s      rY   r  Scheduler.get_donated_buffers8  sl     GG11D!''77=r?O?OPP,BGG11$7 $-#) 2 #"r\   c                6    [         R                  R                  $ rU   rD   r   current_devicer`   s    rY   r  Scheduler.current_deviceC  s    ww%%%r\   c                .    U[         R                  l        g rU   r  r  s     rY   r  r  G  s    !'r\   c                |    [         R                  R                  SS5      S:X  a  SSKJn  U" U R
                  SS9  gg)z,Generate an image of the graph for debuggingINDUCTOR_WRITE_SCHEDULER_GRAPHN1r   )draw_buffersT)print_graph)osenvironr(  r  r  ru  )rW   r  s     rY   r  Scheduler.debug_draw_graphK  s1    ::>>:DASH+6 Ir\   c                    [         R                  [        R                  5      (       a:  [         R	                  SU5        U R
                   H  nUR                  5         M     g g )Nz%s:)r   isEnabledForloggingINFOr  ru  r  )rW   labelrM   s      rY   debug_print_nodesScheduler.debug_print_nodesR  sD    GLL))HHUE"

  " # *r\   c                P   UR                  5       c   S5       eUR                  5       (       a  [        X5      $ [        U[        R
                  [        R                  45      (       a  [        X5      $ [        U[        R                  5      (       a  [        X5      $ [        U5      e)Nz2All nodes passed to scheduling must have an origin)r  is_no_opr  r~   r   r,   r  r~  rI  r0  r  r  s     rY   r  Scheduler.create_scheduler_nodeX  s    !- 	
@	
- ==??)$55r00"2C2CDEE ,,boo..,T88%d++r\   c                   [         [           " 5       n/ nU R                  R                  5       n[        R
                  R                  R                  5        H  nU Vs/ s H0  nXS;   d  M
  [        U R                  U   [        5      (       a  M.  UPM2     nnU(       d  MI  UR                  U5        U Vs/ s H  oPR                  U   PM     nn[        R                  S:  n[        U USUS9nUR                  U5        U H  nXR                  U'   M     M     U R                    V	s/ s H  oR#                  5       U;  d  M  U	PM     sn	[%        U5      -   U l        g s  snf s  snf s  sn	f )Nr   Fr  r  )r   r   rc  r  rD   r   listsr   r~   r  r  r  r   combo_kernels_autotuner  r  ru  rV   r   )
rW   removed_node_namesfe_nodeskept_node_namesnamesr_   r  r  fe_noderM   s
             rY   r  Scheduler.create_foreach_nodese  sP   '_.11668WW]]))+E "!D*  #4#4#4T#:<RS !   %%e,:?@%$''-%F@$;;a?O0*/ /	G OOG$07''- 1 ,8 "ZZ
'T==?BT+TDZ
N
5 A
s$   	E+ EE5E#(E(E(c                  ^ ^^^  [        S5      n " U4S jS[        U   5      m[        R                  " T5      mT R                   H  nUR                  5        H  nUR                  5       nUR                  5        He  nUT;   aD  UT;   a>  TU   nTU   nXg-   nTR                  5        H  n	TU	   UL d
  TU	   UL d  M  UTU	'   M     MM  UT;   a
  TU   TU'   M]  TU   TU'   Mg     M     M     SU U 4S jjm   S         SUU 4S jjjn
0 n[        R                  R                  R                  5        H=  u  p[        U[        R                  5      (       d  M&  UR                    H  nSX'   M	     M?     T R                   GH  n["        R%                  SUR&                  5        UR&                  c   e[)        UR&                  R+                  5       S S	9nU H?  n[        U[        R,                  5      (       d   eUU;  d  M,  UR                  5       UU'   MA     [)        UR&                  R/                  5       S
 S	9nU Hk  nUU;   d   U SU 35       eUU   =nc  M  T R0                  U   R                  5        H+  nUR3                  [5        UR                  5       5      5        M-     Mm     [7        UR8                  R:                  5      S:X  aQ  [=        [?        UR8                  R:                  5      5      =n(       a"  [        U[@        5      (       a  URB                  nOSnUR                  5        GH  n[7        URE                  5       5      S::  d   eURE                  5        H  nT " U5      nU
" UU5        UR3                  [5        UUS95        TU   R                   H  nUR                  5       UR                  5       :X  a  M'  [        UR&                  [F        5      (       d   eUR&                  RI                  5        H:  nT " U5      nUR3                  [K        UUR                  5       S95        U
" UUSS9  M<     M     M     GM     UR8                  RL                   H<  n[        U[J        5      (       a  M  U
" URN                  X"RQ                  U5      5        M>     URS                  T RT                  5        UR                  5        H  nURE                  5        Hz  nUR                  5       T RT                  T " U5      '   UR                  5       T RT                  U'   T RV                  RY                  UU5      T RV                  UR                  5       '   M|     M     GM     [        R                  R[                  5        H4  n["        R%                  SU5        U
" U[]        [5        U5      5      5        M6     [        R                  R^                   H  nUR/                  5        H  nUU;   d   U SUR                  5        35       eUU   =n(       d  M1  T R0                  U   RI                  5        H5  n["        R%                  SUU5        U
" U[]        [5        U5      5      5        M7     M     M     T RT                   H  nU[        R                  R                  ;   aF  U
" U[]        [5        U5      5      5        [        R                  R`                  Rc                  U5        Mg  U[        R                  Rd                  ;   d  M  U
" U[]        [5        U5      5      5        M     [g        [        R                  R                  R                  5       5       VVs0 s H  u  noU_M
     nnn[        R                  R`                   Vs/ s H  nUU   PM
     sn[        R                  l4        T R                   HF  nUR                  5        H/  nURk                  TUR                  5          R                  5        M1     MH     T Rl                   H.  nT Rl                  U   Rk                  TU   R                  5        M0     gs  snnf s  snf )zQ
Create dependency edges between nodes, handling aliasing and
mutation properly.
Tc                  P   > \ rS rSrSr  S     S	S jjrS
S jrSU 4S jjrSrg)1Scheduler.compute_dependencies.<locals>.DedupListi  a  
This data structure behaves like a list except it makes sure the
elements remain unique.
Normally one could use a OrderedSet/dict for this purpose however
the list in question gets elements appended as it is being
iterated over which means that we need to keep the list
semantics.
Nc                T    U=(       d    / U l         U=(       d
    [        5       U l        g rU   )itemsr   
membership)rW   r  r  s      rY   r   :Scheduler.compute_dependencies.<locals>.DedupList.__init__  s    
 #[b
","<
r\   c                    XR                   ;   a  g U R                  R                  U5        U R                   R                  U5        g rU   )r  r  r  rE  )rW   	node_users     rY   r  8Scheduler.compute_dependencies.<locals>.DedupList.append  s3    /

!!),##I.r\   c                   > [         R                  " U R                  UR                  5      nU R                  UR                   Vs/ s H  o3U R                  ;  d  M  UPM     sn-   nT" XB5      $ s  snf rU   )r   rZ  r  r  )rW   rh  new_membershipr  	new_items	DedupLists        rY   __add__9Scheduler.compute_dependencies.<locals>.DedupList.__add__  sc    !+!1!1$//5CSCS!T JJ${{**!t.FA{* 	 !;;*s   A0A0)r  r  rO  )r  zOptional[list[T]]r  zOptional[OrderedSet[T]]r   r   )r  r  r   r   )rh  DedupList[T]r   r  )	ri   r   r   r   r  r   r  r  r   )r  s   rY   r  r    s@     ,06:=(= 4= 	=/< <r\   r  c                R   > U TR                   ;   a  T" TR                   U    5      $ U $ rU   )r  )r  r  rW   s    rY   r  .Scheduler.compute_dependencies.<locals>.rename  s,    D)))d33A677Hr\   c                N   > TT" U 5         R                  [        XU5      5        g rU   )r  ra  )used_by_namer  r  rc  name_to_usersr  s       rY   add_user0Scheduler.compute_dependencies.<locals>.add_user  s'     &./669r\   Nzscheduling %sc                    U R                   $ rU   r7  rr  s    rY   r   0Scheduler.compute_dependencies.<locals>.<lambda>      AFFr\   r  c                    U R                   $ rU   r7  rr  s    rY   r   r    r  r\   z not in r   )rH  )mutating_bufT)rc  zscheduling output %sz+scheduling output %s for unbacked symint %s)r  r   r   r   )FF)
r  r   r  rb  r  r   rc  r   r   r   )7r   r   r2  r   ru  r   rV   rk   r  rD   r   r4  r  r~   r.  r  free_symbolsr   r  rM   r  get_unbacked_symbol_defsSymbolget_unbacked_symbol_usesr  r  r(   rn   r   r   rs  r@  r'   rH  rm   rG   rx  r)   r   r_   r  r  r  r*  r(  r  r   graph_outputsmutated_inputsrE  r  rt  mutated_input_idxsr   r   )!rW   r  rM   buf1	buf1_name	buf2_namelist1list2combinedr  r  unbacked_symbol_to_origin_noder_   valfsunbacked_symbol_defsr  unbacked_symbol_usesr  r   r  	node_modealt_namerr   
other_namer  r  r   rA  	inp_namesr  r  r  s!   `                             @@@rY   r  Scheduler.compute_dependencies  sN    CL	<
 	<> @K?V?V@
 JJD((* MMO	!%!1!1!3I M1i=6P -i 8 -i 8#(=#0#5#5#7C -c 2e ;#0#5#>5=c 2 $8 #m33@3Ki03@3Ki0 "4 + (	 	 !&!			;	 	 		
 	 	 MO&
 --335ID#uzz****B9=26 + 6
 JJDIIotyy1 99(((#)		224:J$  *!!U\\2222 ::8<215 * $*		224:J$  *:: c"@!AB: 8::AG#003??A))'#,,.*AB  B * D$$++,1 d&6&6&=&=!>??S?sI..HH	 	 '')3,,./1444 # 1 1 3H%h/HXt,%%ghY&GH -h 7 = ===?dmmo=$)$))5FGGGG*.))*D*D*FJ)/
);J -- '
 P %ZtD +G !> !4 *, ((..!$00TYY.>.>t.DE / %%d&;&;< '') # 1 1 3H>AllnD))&*:;69llnD))(3//33HhG ++CLLN; !4 *I Z 002HII,h7Xz'(*;<= 3
 77((C113:: c"@"E"E"G!HI: 7q9919$($5$5a$8$I$I$K		I8UV !:gh6G+HI	 %L 4 ) ))Dqww+++z'$-89&&**40***z'$-89 * ,5QWW5I5I5N5N5P+Q
+QKE4%K+Q 	 
 )*(>(>&
(>IdO(>&
"
 JJD'')mCLLN;AAB *  //D''-77d8K8Q8QR 0
&
s   (aac                  ^	 / n[        U R                  5       GH  nSS jm	SnUR                  5        H  n[        U	4S jUR                   5       5      nU(       a]  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        M  SnM     UR                  5       (       + =(       a    U(       + nU(       d  UR                  U5        M  [
        R                  SUR                  5       5        [        R                  R                  R                  UR                  5       5        UR                  R                    H  nUR"                  U R$                  ;   d  M  U R$                  UR"                     R                  nU Vs/ s H2  oR&                  R                  5       UR                  5       :w  d  M0  UPM4     snU R$                  UR"                     l        M     GM     [)        [        U5      5      U l        U R                   H  nUR+                  5         M     gs  snf )	z 
Remove any nodes without users
c                ~    U R                   =(       d+    U R                  5       [        R                  R                  ;   $ rU   )rc  rV   rD   r   rT  )rr   s    rY   can_eliminate_user;Scheduler.dead_node_elimination.<locals>.can_eliminate_userd	  s&    ||Tt}}!'':T:T'TTr\   Fc              3  4   >#    U  H  nT" U5      v   M     g 7frU   r   )r  ur  s     rY   r  2Scheduler.dead_node_elimination.<locals>.<genexpr>i	  s     #M9a$6q$9$99   zremoved dead buffer: %sTzremoved dead operation: %sN)rr   ra  r   r   )r-  ru  r   r  rR   r   r  rV   rD   r   r  rE  r  r  rT  r   r   r_   r   rM   r   r_  )
rW   updated_nodesrM   active_buffersr   can_eliminater  rR   r  r  s
            @rY   r  Scheduler.dead_node_eliminationZ	  s    TZZ(DU #N'') ##M399#M M II7HGG++//?%)N * !% 5 5 77N<NM $$T* 		6H**..t}}? ,,22DyyD$4$44 $ 0 0 ; A A',=',!0AT]]_0TAu=((39 3- )8 (=12
 JJD  " =s   4/I'Ic                   ^^^^ [         [           " 5       m[        5       m/ mSUUUU4S jjmU H  nUR                  5        H  nUTU'   M
     M!     U H  nT" U5        M     T$ )z/
Ensure nodes is in topologically sorted order
c                   > U T;  af  TR                  U 5        [        U R                  S S9 H*  nUR                  T;  a  M  T" TUR                     5        M,     TR	                  U 5        g g )Nc                    U R                   $ rU   r7  )ds    rY   r   DScheduler.topological_sort_schedule.<locals>.visit.<locals>.<lambda>	  s    affr\   r  )rE  r  r   r_   r  )r  r  r  rq   seenvisits     rY   r  2Scheduler.topological_sort_schedule.<locals>.visit	  sa    }!!"6"6<LMCxx|3 ,sxx01	 N
 a  r\   )r  rG   r   r   )r   rG   r   rx  )rW   ru  rM   r_   r  rq   r  r  s       @@@@rY   r  #Scheduler.topological_sort_schedule	  sj     +,.59V*,	! 	! D--/%)T" 0  D$K r\   c                T  ^  [         [           " 5       n[        U[        [        [
        [        45      (       a/  UR                   H  nUR                  UR                  5        M      O[        S[        U5       S35      eU 4S jU 5       n[        [        U 4S jU 5       5      5      $ )Nz+get_unmet_dep_nodes is not implemented for .c              3  ^   >#    U  H"  nTR                   U   R                  5       v   M$     g 7frU   )r   rZ   rM  s     rY   r  1Scheduler._get_unmet_dep_nodes.<locals>.<genexpr>	  s(     XZc))#.??AAZr  c              3  B   >#    U  H  nTR                   U   v   M     g 7frU   rc  )r  r  rW   s     rY   r  r  	  s     Q=at66q9=s   )r   r   r~   r~  r0  r  r3  r   rE  r_   RuntimeErrorrh   r   )rW   rn  
unmet_depsr  unmet_dep_opss   `    rY   _get_unmet_dep_nodesScheduler._get_unmet_dep_nodes	  s    _&
)&"	
 
 //sxx( 0 =d5k]!L  YZXJQ=QQRRr\   c                   / n[         R                  U R                  S5      n0 nU R                   HQ  nU R                  U5      n[	        U5      X$'   U H*  nUR                  U/ 5      nUR                  U5        XsU'   M,     MS     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  UR                  U
5        U
 H9  nUR                  U/ 5       H  nX+==   S-  ss'   M     UR                  U5        M;     UR                  5        VV	s/ s H  u  pU	S:X  d  M  UPM     n
nn	U
(       a  M  U(       a   S5       eU$ s  sn	nf s  sn	nf )zE
Sort nodes by their topological order, return a list of node lists.
r   r   zTopological sort failed!)	r   fromkeysru  r!  rn   r(  r  r  rD  )rW   r]  ru  childrenrM   rH  r  cr  vzero_deg_nodesrr   s               rY   r   !Scheduler._topological_sort_nodes	  s,    djj!,#%JJD,,T2Dd)EKLLb) !   ).@a!@LL(#$LLB/DK1$K 0		! $ -2KKMDMDAQ!VaMND n 444y A Es   E)EE,Ec                z   0 nU R                    H  n[        [           " 5       nUR                   HB  nU R                  UR
                     R                  5       nUR                  U5        X1U   -  nMD     X1UR                  5       '   X2l	        M     [        U R                   5       H  u  pbXbl        Xbl        M     g)z
Populate each node.ancestors
N)ru  r   r   r   r   r_   rZ   rE  rV   r   rt  r   r   )rW   name_to_ancestorsrM   r   r  dep_node_namer]  s          rY   r  Scheduler.compute_ancestors	  s    
 9;JJD"3)I.. $ 0 0 : K K Mm,}==	 / 2;dmmo.&N  %TZZ0KE"N"N 1r\   c                   U R                    H  n[        R                  (       d  M  [        U[        [
        45      (       a)  UR                  5       (       d  [        R                  S:w  a  M`  UR                  5        H?  n[        U[        5      (       a  UR                  5       (       a  M/  UR                  5         MA     M     g )Nhalide)ru  r   r  r~   r~  r3  r?   cpu_backendrq  r  r  )rW   rM   rn  s      rY   r  Scheduler.merge_loops	  s    JJD44 d]4F$GHHKKMMf&8&8H&D)!%775;L;L;N;N!!# * r\   c                z   [        S5         [        S5       H  n[        U5      n[        R	                  SUS-   U5        U R                  U5      n[        U5      n[        R	                  SUS-   UU5        XC:X  d  US:X  d  Ml  [        R	                  SUS-   5          O   UsSSS5        $ ! , (       d  f       g= f)z2
Combine eligible nodes into FusedSchedulerNodes.
zScheduler.fused_nodes
   z/===== attempting fusion (%d/10): %d nodes =====r   z=completed fusion round (%d/10): fused %d nodes into %d nodes
z+===== fusion complete (%d iterations) =====N)r   r  rn   r  r  fuse_nodes_once)rW   ru  rq  old_lennew_lens        rY   r1  Scheduler.fuse_nodes	  s     122Ye*  EE
 ,,U3e*  TE	 %A$$Eq1u ' ( + 322s   A4B,B,,
B:c                    / nU R                    H:  nUR                  [        U[        5      (       a  UR	                  5       OU/5        M<     Xl         g)z1
Unpack GroupedSchedulerNode into regular nodes.
N)ru  rG  r~   rc  r2  )rW   	new_nodesrM   s      rY   r  Scheduler.process_grouped_nodes
  sF     .0	JJD!+D2F!G!GdV  
r\   c                    [        U5      S:  d   eUS   R                  5       nX l        U R                  U5      n[	        SSSS9   UR                  U5      sSSS5        $ ! , (       d  f       g= f)k
Benchmark fused list of nodes and return the execution time
in milliseconds on randomly generated inputs.
r   benchmark_fused_nodesTcompile_time_autotune_time_us)log_pt2_compile_eventdynamo_compile_column_usN)rn   r  r  r  r   r=  )rW   ru  r  backends       rY   r=  Scheduler.benchmark_fused_nodes!
  sm     5zA~~q$$&$""6*#"&%D

 007
 
 
   A""
A0c                    [        U5      S:  d   eUS   R                  5       nX0l        U R                  U5      n[	        S5         UR                  X5      sSSS5        $ ! , (       d  f       g= f)r<  r   r=  N)rn   r  r  r  r   generate_kernel_code_from_nodes)rW   ru  benchmark_kernelr  rA  s        rY   rE  )Scheduler.generate_kernel_code_from_nodes3
  s_     5zA~~q$$&$""6*12::5S 322rC  c                    X l         U R                  U5      n[        S5         UR                  U5      sSSS5        $ ! , (       d  f       g= f)r<  r=  N)r  r  r   benchmark_codegened_module)rW   moduler  rA  s       rY   rI  $Scheduler.benchmark_codegened_moduleA
  s=     %""6*1255f= 322s	   >
Ac                         SS jn[        U R                  5       GH]  u  p#[        U[        5      (       d  M  [        UR                  [
        R                  5      (       d  MH  UR                  n[        R                  R                  (       d  UR                  5       u  pVO[        S UR                   5       5      n[        U[        R                  R
                  R                  5      (       a  UR                  R!                  U5        M  UR#                  5       nUR$                  n[        U[
        R&                  5      (       d   eUR$                  n	[        U	[
        R(                  5      (       d   eUR*                  U	l        U" XI5        U R-                  U	5      n
XR                  U'   XR.                  UR1                  5       '   XR2                  UR1                  5       '   [5        U
R7                  5       UR7                  5       5       H2  u  pXR8                  UR1                  5       '   UR:                  Ul        M4     UR<                  U
l        UR>                  U
l        UR@                  U
l         GM`     g )Nc                   UR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   eUR                  5       nU R                  5       n[        U[        5      (       a  [        U[        5      (       d   e[        R
                  R                  U	 X1l        [        R
                  R                  U	 XQl	        [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   [        R
                  R                  R                  U 5      n[        R
                  R                  R                  U5        U[        R
                  R                  U'   U[        R
                  R                  U'   g rU   )rV   r~   r   rg  rD   r   rF  r_   
name_to_opoperation_namebuffersrA  remove
operations)	orig_noder  replaced_buf_nameorig_buf_namereplaced_op_nameorig_op_nameorigs          rY   replace_operation_bufferKScheduler.finalize_multi_template_buffers.<locals>.replace_operation_bufferN
  s_    !) 1 1 3%..0MmS11jARTW6X6XXX'::<$779LlC00Z@PRU5V5VVV&&'89)M""#34&2#77??((3DGGOO""8,$,AGGOOD!4<AGG""=177%%++I6DGG%%h/'/AGGt$/7AGG|,r\   c              3     #    U  H<  n[        U[        R                  R                  R                  5      (       d  M8  Uv   M>     g 7frU   )r~   r   r   select_algorithmExternKernelCaller)r  timings     rY   r  <Scheduler.finalize_multi_template_buffers.<locals>.<genexpr>r
  s7      *C) & % @ @ S S  #F*Cs
   7A	A)rS  zir.MultiTemplateBufferr  zir.OperationBufferr   r   )!rt  ru  r~   r~  rM   r   MultiTemplateBufferr   test_configs%force_extern_kernel_in_multi_templateget_min_choicers  choice_timingsr   r   TritonTemplateCallerBasefinalize_as_triton_calleroutput_noder   
StorageBoxOperationBufferrj   r  r  rV   rc  r!  r   r   rR   r   r   r   )rW   rY  rq  rM   
multi_nodemin_node_unfusedrx  out_tensorboxout_storage
out_buffernew_scheduler_nodenew_outold_outs                rY   r  )Scheduler.finalize_multi_template_buffersM
  s   	8-	89K	8	86 !,GA$..:		2114 4 "YY
**PP*4*C*C*E'$a'+*4*C*C	($ $OO&&??  II778HI 0 < < >+00!+r}}====(--
!*b.@.@AAAA$.$5$5
!(@%)%?%?
%K" 2

15G!!$--/2;M''8(+&224d6F6F6H)$G <C$$W%5%5%78$+MMGM	) 04~~",/3~~",04"-a -r\   c                &    [        S U 5       5      $ )Nc              3    #    U  H  n[        UR                  S 5      =(       a_    UR                  SL=(       aJ    [        UR                  R                  S5      =(       a#    UR                  R                  R                  S:H  v   M     g7f)r   Nscatter_moderI  )r   rM   r   ru  r  s     rY   r  ,Scheduler._any_atomic_add.<locals>.<genexpr>
  sp      

 	 AFFF# 9d"9^49 ((L89 s   B	B)r  rW   	node_lists     rY   _any_atomic_addScheduler._any_atomic_add
  s     

 
 
 	
r\   c           	     v  ^ ^^^^^^^^^^^^^^^ [        S TT4 5       5      n[        R                  (       d  U(       d  gTR                  5       (       a-  [	        TR                  5       [        R                  5      (       a*  TR                  5       (       d  TR                  5       (       a  gTR                  5       nUS   R                  5       mT(       d   eTR                  S:X  a  gTR                  5       n[        [        R                  " XE5      5      nT R                  U5      (       a  gSSKJm  [%        TT5      mUS   R                  5       mTc   eSUU4S jjm[&        R(                  R*                  R-                  5       m    SUU 4S jjnU(       Ga  [        S	 TT4 5       5      (       Ga  TR                  5       SLmT(       a  TR                  5       OTR                  5       m[	        T[        R.                  5      (       d   eTR0                  nTR3                  5       u  n	mTR3                  5       u  n	mT(       a  T R5                  U5      OT R5                  U5      u  mn
/ mSn[7        UR9                  5       S
 S9 H  u  p[	        U[&        R(                  R                  R:                  5      (       d  M:  T(       d-  [=        US5      (       a  UR>                  TR>                  :w  a  Mn  UTT-   :  a    OTUS-  nU[        R@                  :  a    O9TRC                  U5         TRE                  U/U" U5      Q75        SSS5        M     [G        T5      S:X  a  gSUUUUUUUU 4S jjnU$ U" U5      mU" U5      mU" U5      mSUUUUUUU U4S jjnU$ ! , (       d  f       GM'  = f)o
If config.benchmark_fusion is False, always return True.
Otherwise, return True if fusion can brings speedup.
c              3     #    U  HD  nUR                  5       =(       a(    [        UR                  5       [        R                  5      v   MF     g 7frU   )r  r~   rf  r   r`  r  s     rY   r  .Scheduler.speedup_by_fusion.<locals>.<genexpr>
  sD       
 $ MMO J1..0"2H2HIJ#s   AATr   r  CompilationErrorNc           
     z  > [         R                  [        R                  5      (       a  XU-   :  aE  [         R	                  STR                  5       TR                  5       [        X-   U -  S 5      5        g [         R	                  STR                  5       TR                  5       [        XU-   -  S 5      5        g g )Nz9can fuse (benchmark): fusing %s with %s cause %sx speedup.3fz=cannot fuse (benchmark): fusing %s with %s cause %sx slowdown)r  r  r  DEBUGr  rx  r4   r5   )ms_fusedms1ms2r  r  s      rY   
log_fusion/Scheduler.speedup_by_fusion.<locals>.log_fusion
  s    &&w}}55Ci'$$S..0..0"syH&<S%AC	 $$W..0..0 Hc	$:3#?A	 6r\   c                   > TR                  U SS9n[        R                  " U5      nTR                  5       (       d  S nX24$ TR	                  SUS9n[        U[        5      (       d   eX24$ )NT)rF  triton_)kernel_namesource_code)rE  r   loaduse_process_pooltritonr~   r   )ru  src_codemodfutasync_compilerW   s       rY   compile_kernel3Scheduler.speedup_by_fusion.<locals>.compile_kernel
  s     ;; < H ""8,C 1133
 : $**yh*W!#|4444:r\   c              3  D   #    U  H  oR                  5       S Lv   M     g 7frU   rj  r  s     rY   r  r~  
  s      %
7E!!-~s    c                    U S   $ r  r   rr  s    rY   r   -Scheduler.speedup_by_fusion.<locals>.<lambda>  s    adr\   r  allowed_prologue_inpsr   Fc            	     0  > [        S5      n S n0 nT HU  u  p4n Ub  UR                  5         TR                  U5         TR                  UT	5      u  pxXrU'   Xp:  a  Un UnS S S 5        MW     T" U TT5        U TT-   :  a  Ub  TR                  U5        UTl        gg! [         a\  n[        R	                  [
        R                  5      (       a)  [        R                  ST
(       d  SOS[        U5      5         S nAM  S nAff = f! , (       d  f       GM  = f)NinfzException in compiling %s: %srw  ry  TF)r  rq   r   r  r  r  r  r  r   swap_as_triton_callerrI  rf  _choice_timings)min_ms_fusedms_fused_choicenew_timingschoicefuture	mod_fusedrU  r  pathr  epilogue_fusionfuture_choicesr  r  r  rj  rW   s            rY   benchmark_when_ready9Scheduler.speedup_by_fusion.<locals>.benchmark_when_ready1  s   $U|"& 1?-FI!!-"MMO $99&A)-)H)H%v* /7F+#2+3L.4O BA 2@0 <c239-/2M88I1<J. 1 % !%227==AA&,, ?2A
z #A
 !! BAs#   B"D
D&AC==D
D	c                 (  >^^^^^^ SSK Jn    TS   TS   TS   4 H  nUc  M  UR                  5         M     TR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gTR                  TS   T
5      u  mm[        R
                  " T5      (       a	  T" S5        gT" TTT5        [        S5      (       a[  TTT-   :  aR  TT4TR                  ;  a@  TR                  R                  TT45        [        S5      R                  UUUUUU4S	 j5        TTT-   :  $ ! U  a     gT	 a  nS
[        U5      ;   a   S nAge S nAff = f)Nr   )NoTritonConfigsErrorr   z%register spilling of the first kernelFz&register spilling of the second kernelz%register spilling of the fused kernelslow_fusionc            	     $   > TT TTTTTT T-   -  S.$ )N)kernel1_pathkernel1_latencykernel2_pathkernel2_latencyfused_kernel_pathfused_kernel_latencyslow_down_ratior   )r  r  r  path1path2
path_fuseds   rY   r   KScheduler.speedup_by_fusion.<locals>.benchmark_when_ready.<locals>.<lambda>  s&    053605365?8@3;sSy3I%r\   Loop-carried variableT))torch._inductor.runtime.triton_heuristicsr  rq   rI  mathisinfr   r  rE  r   r  r   )r  r  rU  r  r  r  r  r  r  r  r  future_and_mod_l1future_and_mod_l1_fusedfuture_and_mod_l2r  rW   r  s      @@@@@@rY   r  r  `  s   ; *!,)!,/2 
 ?JJL  "&!@!@)!,f"JC zz#CD$!%!@!@)!,f"JC zz#DE$+/+J+J/2F,(Hj zz(++CD$xc2 0>>$c	1"EN$2I2II//33UENC(7?? 
 $cCi//+ ! ' .#a&8#s<   E* AE* 5;E* 1;E* -A<E* *F2F7FFF)r  r  r  r  r  r  r   r   )ru  r/  r   z)tuple[Optional[LambdaFuture], ModuleType]r   )$r  r   benchmark_fusionr  r~   rf  r   TritonTemplateBufferr  rq  r  rh   r   r;  r<  ry  triton.compiler.errorsr  r  r   r   r  AsyncCompiler`  rd  rc  r=  r  r  re  r   r   max_epilogue_benchmarked_choicesr  r  rn   )rW   r  r  is_multi_templatenode_list_1node_list_2node_list_fusedr  rd  rx  r  triton_choicesr  unfused_timer  r  r  r  r  r  r  r  r  r  r  r  rj  r  s   ```            @@@@@@@@@@@@@rY   speedup_by_fusionScheduler.speedup_by_fusion
  sk       
 U^ 
 

 &&/@ u668":Q:QRR!!!! oo'Q**,v ;;%oo'y{HI
 00;u% #..0!!!	 	" 55BBD	.	6	 	$  %
8=u~%
 "
 "
 $557tCO # ''),,. 
 j"*@*@AAAA'66N..0FAs  ..0FAs # **;7//< C TVNN(.$$&N)$ "&%//*<*<*U*UVV ((?@@44
8X8XX39,!#!F$K$KK55f="))6*TN?4S*TU >=3)8 >"a'%! %!N (' !/{ ; .{ ;&4_&E#@ @D ('o >=s   6N((
N8	c                <    U R                   UR                  5          $ )z0Look up the node in Scheduler name_to_fused_node)rc  rk  r  s     rY   r  Scheduler.get_fused_node  s    &&t':':'<==r\   c                0  ^ ^^^ [        U5      m[        R                  [        R                  5      (       aD  [        R                  S5        T H)  n[        R                  SUR                  5       -   5        M+     0 m      SUU 4S jjm      SUUU 4S jjnT R                  U5       H  u  pEU" XE5        T R                  U5      nT R                  U5      nT R                  XE5      (       d  MG  T R                  XE5      (       a  M_  T R                  XE5      n[        U5      (       a  XdU4TU'   XdU4TU'   M  U(       d  M  T" XE5        M     [        5       nTR                  5        Hx  u  pn
X;   a  M  UR                  U5        T R                  U	5      U	L d   eT R                  U
5      U
L d   eU" 5       (       d  MX  T R                  X5      (       a  Mp  T" X5        Mz     [        TS S9nT R!                  U5      nT R#                  U5        U$ )	z
Combine eligible nodes into FusedSchedulerNodes.

This relies on two key functions to control the logic:
    - self.can_fuse(): checks if a fusion is legal
    - self.score_fusion(): assigns priority to a given fusion
zfuse_nodes_once, candidates:z  c                  > [         R                  SU R                  5       UR                  5       5        U R                  5       nUR                  5       U:X  d   eTR	                  U5      R                  X5      nTR                  U 5        TR                  U5        TR                  U5        TR                  R                  UR                  5        Vs0 s H  oDR                  5       U_M     sn5        U$ s  snf )Nzfusing %s with %s)r  r  rV   r  r  rj  rQ  rE  rc  r  rq  )r  r  r  node3r  r  rW   s        rY   fuse_two_nodes1Scheduler.fuse_nodes_once.<locals>.fuse_two_nodes  s     0%..2BENNDTU%%'F##%///$$V,11%?Eu%u%OOE"##**.3oo.?@.?u$.?@ L As   C8c                D  > TR                  U 5      T;   d  TR                  U5      T;   a  TR                  TR                  U 5      TR                  TR                  U5      S 5      5      nUc   eUu  p4nTR                  US 5        TR                  US 5        TR                  U5      UL d   eTR                  U5      UL d   eU" 5       (       a  TR                  X5      (       a  M  T" XE5        TR                  U 5      T;   a  M  TR                  U5      T;   a  M  g g rU   )r  r(  rD  will_fusion_create_cycle)	r  r  pending_fusion
is_speedup	node_key1	node_key2r  pending_fusionsrW   s	         rY   resolve_pending_fusions:Scheduler.fuse_nodes_once.<locals>.resolve_pending_fusions  s    ##E*o=&&u-@!0!4!4''.#''(;(;E(BDI" &1113A0
y##It4##It4**95BBB**95BBB!||t'D'DU'R'Ry4' ##E*o=&&u-@r\   c                    U R                   $ rU   r^  rr  s    rY   r   +Scheduler.fuse_nodes_once.<locals>.<lambda>  s    !++r\   r  )r  rG   r  rG   r   rG   r  )r   r  r  r  r  r  r  get_possible_fusionsr  r  r  r  callabler   rE  r  r  rd  )rW   ru  rM   r  r  r  speedupseen_pair_speedup_fnis_speedup_fnr  r  r  r  r  s   `          @@@rY   r4  Scheduler.fuse_nodes_once  s     !'""7==11;<#  (<(<(>!>? $  	
	$	->		 	 	5$	5->	5	5 	52 !55e<LE $E1''.E''.E}}U**43P3P4 4 00>G$$.5e-DOE*.5e-DOE*u,) =, @J|3B3I3I3K/Mi4 $$]3&&y1Y>>>&&y1Y>>>t'D'D( ( y4 4L {(=>..u5!!%(r\   c                   [        U R                  5      nSn[        U R                  5      n[        R	                  SU5        [        [        R                  U 5      5       GH(  u  pV[        R                  U5      n[        U5      S:  a  M,  Ub  X1:  a    OU R                  U5      (       d  [        R	                  SU5        Md  US-  n[        R                  S:  n[        US   R                  USUS9n[        R                  S	[        U5      U5        U H  n	UR                  U	5        M     UR                  U5        U R                   R#                  UR%                  5        V
s0 s H  oR'                  5       U_M     sn
5        GM+     [)        US
 S9U l        U R+                  U R                  5      U l        [        R                  SUU[        U R                  5      5        U R-                  U R                  5        gs  sn
f )z
Groups parallel nodes
r   z2ComboKernels: Generating with num_ck_nodes = %d...rK  Nz)ComboKernels: Not speeding up %d-th groupr   Tr  z0ComboKernels: Combining %d nodes for %d-th groupc                    U R                   $ rU   r^  rr  s    rY   r   5Scheduler.create_combo_kernel_nodes.<locals>.<lambda>8  s    q{{r\   r  zEGenerated ComboKernel nodes: %d ComboKernels, totally %d -> %d nodels)r   ru  rn   r   r  rt  r  r  r  speedup_by_combo_kernelr   r  rL   r  rQ  rE  rc  r  rq  rV   r  r  rd  )rW   ry  r  r  num_nodes_orignumrx  r  rU  rM   r  s              rY   r  #Scheduler.create_combo_kernel_nodes  s    !,TZZ		FU'&DDTJ
NC 3CCINI9~!'E,@//	::		EsKQJE$;;a?O4!&&*. /	K HHBI
 """4( "OOK(##**4?4I4I4KL4Kq{*4KL7
< K-BC
33DJJ?
S

O		
 	!!$**- Ms   (H
c                L    U H  nUR                  U R                  5        M      g rU   )rd  rc  )rW   ru  rM   s      rY   rd  Scheduler.prune_redundant_depsB  s     D%%d&=&=> r\   c                  ^ ^	^
 / m	[         [        [        [        4      " 5       m
SU	U
U 4S jjn[        R                  " [
        5      nU HE  nT R                  U5      (       a  M  UR                  5        H  nX5   R                  U5        M     MG     UR                  5        H  nU" U5        M     [        R                  (       ak  [        R                  " [
        5      nU H,  n[        USS5      nU(       d  M  Xx   R                  U5        M.     UR                  5        H  nU" U5        M     T R                  T	5      m	T	R                  T R                  SS9  [         R#                  S[%        T	5      5        T	$ )zN
Helper to find all legal fusion opportunities, sorted by self.score_fusion()
c                  > [        U 5       H  u  pXS-   S   H  nX#4nUT;   a  M  TR                  U5        TR                  X#5      (       a  TR                  U5        MH  UR	                  5       (       d  UR                  5       (       d  Mt  TR                  X25      (       d  M  TR                  X245        M     M     g r  )rt  rE  r  r  r  r  )ru  node1_indexr  r  r  possible_fusionsr  rW   s        rY   check_all_pairs7Scheduler.get_possible_fusions.<locals>.check_all_pairsO  s    &/&6""?#45E .Cd{ HHSM}}U22(//4++--1A1A1C1CJ J )//? 6 '7r\   r   NT)r  reversezfound %d possible fusionsru  r  r   r   )r   rm  rG   r2  r   r   unfusable_noder=  r  r   r   aggressive_fusionr   *get_possible_fusions_with_highest_priorityr[  score_fusion_keyr  r  rn   )rW   ru  r  buffer_names_groupingrM   r   node_groupinggroup_groupingr   r  r  s   `        @@rY   r  Scheduler.get_possible_fusionsF  sV    % 13D DEFH	@ 	@  !, 7 7 =D""4((--/%*11$7 0 
 399;MM* < ##(44T:Ngt45")006  "0!6!6!8. "9  JJ
 	$"7"7F4c:J6KLr\   c                  ^ ^^^^ [         [           " 5       mSUUUU U4S jjmUR                  5       R                  R	                  5       UR                  5       R                  R	                  5       -  mUR
                  R                  R	                  5       UR
                  R                  R	                  5       -  T-
  m[        UU 4S jT 5       5      nU(       a  [        X5      " S5        U$ )zf
Finds whether there's a path from node1 to node2 (or vice-versa)
caused indirectly by other fusions.
c                ,  > [        U [        5      (       a~  U T;  ax  TR                  U 5        U R                  5       R	                  T5      (       a  g[        TU R                  -  5      =(       d#    [        UU4S jU R                  T-
   5       5      $ g)NFc              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frU   r  r  r  
found_pathrW   s     rY   r  IScheduler.will_fusion_create_cycle.<locals>.found_path.<locals>.<genexpr>  s,      H!DA #4#:#:1#=>>!D   "%)r~   r3  rE  rr  issubsetr   r   r  )rM   combined_ancestorscombined_namesr  rW   visiteds    rY   r  6Scheduler.will_fusion_create_cycle.<locals>.found_path  s    $ 233G8KD!++-667IJJ !   ?@ C H!%2D!DH E  r\   c              3  N   >#    U  H  nT" TR                   U   5      v   M     g 7frU   r  r   s     rY   r  5Scheduler.will_fusion_create_cycle.<locals>.<genexpr>  s&     WDVqJt66q9::DVr  zwill create cyclerM   rG   r   r   )r   r3  rr  _dictr  r   r  r  )rW   r  r  cycler  r  r  r  s   `   @@@@rY   r  "Scheduler.will_fusion_create_cyclex  s     /02	 	2 %%'--224'')//4467 	
 OO!!&&(5??+@+@+E+E+GG WDVWWe#$78r\   c                  ^ ^ SSK Jm      SU 4S jjnU" U5      nU" U5      n[        U4S jU 5       5      n[        U4S jU 5       5      nUR                  U5      nSn	U H  n
 U	[	        U
S   5      -  n	M     T R                  X5      n[        R                  R                  R                  U	S	U-  5      (       a  g
g! [
         a       gf = f)a  
Return true if fusing the two nodes can potentially increasing peak memory.

The implementation is more like a heuristic since we don't really know if we are at peak
or not when trying to fuse these two ndoes. The order of nodes may change later which makes the
peak memory estimation hard.

Here is how we decide the LOWER BOUND of extra memory allocation if we fuse these 2 nodes:
1. find all buffers read by each node with a single user. These buffers are supposed to
   be reused if we don't fuses these 2 nodes
2. find the intersection of these buffers for the two node and sum the total buffer size.
   If we don't fuse these two nodes, we can at lease avoid this much memory allocation.
   Note that the extra memory allocation is not necessarily causing peak memory increase.
   This is just a heuristic.

We return true only if the saving for fusion can not trade off the extra memory allocation.
r   )buffer_reuse_keyc                P  > / nU R                   R                   H  nTR                  R                  UR                  5      nU(       d  M1  [        UR                  5      S:X  d  ML  UR                  R                  5       (       d  Mm  UR                  UR                  5        M     U$ r  )
r   r   r   r(  r_   rn   rR   rM   has_tensor_outputr  )rM   r   r  r   rW   s       rY   _find_single_user_inputsKScheduler.can_fusion_increase_peak_memory.<locals>._find_single_user_inputs  sw     F&&,,&&**277333syy>Q.3883M3M3O3OMM#((+ - Mr\   c              3  4   >#    U  H  nT" U5      v   M     g 7frU   r   r  r   r  s     rY   r  <Scheduler.can_fusion_increase_peak_memory.<locals>.<genexpr>       #S]c$4S$9$9]r
  c              3  4   >#    U  H  nT" U5      v   M     g 7frU   r   r  s     rY   r  r    r  r
  r   rK  F    T)rM   rG   r   zlist[ir.Buffer])r  r  r   intersectionr   rF  score_fusion_memoryrD   r   r
  statically_known_gt)rW   r  r  r  lhs_dep_nodesrhs_dep_nodeslhs_reuse_keysrhs_reuse_keyscommon_reuse_keysmemory_overheadr  	bw_savingr  s   `           @rY   can_fusion_increase_peak_memory)Scheduler.can_fusion_increase_peak_memory  s    * 	6	#		 1707##S]#SS##S]#SS*77G$C3s1v;. % ,,U:	 77//iPP  s   (C
CCc                    [        [        UR                  UR                  -
  5      [        UR                  UR                  -
  5      5      nUS:  $ )a  
This function prevents fusion for nodes that can increase memory
footprint. This problem is more common in horizontal fusion, where nodes
that are far apart in the original order get fused, lengthening the live
intervals of tensors. This is very evident in models with activation
checkpointing, where the recomputed nodes from different checkpointed
regions get fused and significantly increase the memory footprint.

The current attempt is a quick, possibly hacky, heuristic to prevent the
fusion of nodes that are far away in the original order.

A better but difficult to implement heurisitic would be to use live
intervals of the buffers, find region of peak pressure in the original
program and prevent fusion that crosses that peak region. We might need
special care or good approximation in this implementation, as fusion of
node changes live intervals, and re-computing live intervals and peak
memory after each fusion can introduce large compilation overhead.
@   )rT  rM  r   r   )rW   r  r  proximity_scores       rY   are_long_distant_nodes Scheduler.are_long_distant_nodes  sE    * %//12%//12
 ##r\   c                (   0 nUR                   R                  5        Vs0 s H  oUR                  U_M     nnUR                   R                  5        Vs0 s H  oUR                  U_M     nnU GH  n[        R                  R                  U5      n	Xh   n
Xx   n[        U
[        5      (       a  [        U[        5      (       d  S[        U
5       S[        U5       3XH'   Ms  U
R                  5       UR                  5       :w  a(  SU
R                  5        SUR                  5        3XH'   M  [        U
R                  5      [        UR                  5      :w  a  SXH'   M  U
R                  5       nUR                  5       nX:w  a  SU SU 3XH'   GM!  U
R                  5       UR                  5       :X  a  SU
 SU 3XH'   GMP  Sn[        U	[        R                  5      (       d  SU	R                    3nS	U
 SU S
U 3XH'   GM     [#        U5      $ s  snf s  snf )ze
Try to decide reasons why fusion fail due to no shared memory even though
there are common buffers.
znot MemoryDep: z v.s. zdifferent numel: 	broadcastzdifferent offset: zMismatch loop orders: r   zLayout: zUnknown reason: z. )r   r  r_   rD   r   r"  r~   r'   rh   r@  rC   rB  
get_offsetnormalize_with_stride_orderr   r'  rj   r   )rW   r  r  common_buf_namesreasonsr  node1_name2depnode2_name2depr  r   lhs_deprhs_deplhs_offrhs_off
layout_strs                  rY   decide_fusion_fail_reason#Scheduler.decide_fusion_fail_reason  s    383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX(H''$$X.C$.G$.Ggy11GY9W9W%d7m_F4=/J !   "g&7&7&99'(9(9(;'<F7CTCTCVBWX !  W\\*mGLL.II$/!((*G((*G! '9	y$Q! 3356689 '=WIVG9$U! Jc2#5#566'

|4
"7)6'"ZLI U )\ 7|c YXs   H
Hc                   [         R                  (       a  [        S X4 5       5      (       a  gUR                  R	                  5       nUR                  R	                  5       nX4-  nU(       d  gUR                  R                  5        Vs0 s H  ofR                  U_M     nnUR                  R                  5        Vs0 s H  ofR                  U_M     nn/ n	U Hw  n
Xz   nX   nUR                  5       UR                  5       :X  d  M/  U	R                  [        R                  R                  R                  UR                  5       SS9UU45        My     [        U	5      S:X  a  g[        U	S S9u  pn[!        U["        5      (       a  [!        U["        5      (       d  gUR$                  UR$                  :w  a4  UR'                  5       UR'                  5       :X  a  U R)                  U5      $ gUR+                  5       (       d  UR-                  X5        OZUR+                  5       (       d  UR-                  X5        O3[.        R1                  SUR3                  5       UR3                  5       5        U R5                  X5      $ s  snf s  snf )z
Right now just greedily reorder the loop of node1 to be compatible with node2,
but ideally we should have some heuristics to reorder the loop for node2
to be compatibile with node1 if that's more efficient.
c              3  @   #    U  H  oR                  5       v   M     g 7frU   )r  r  s     rY   r  >Scheduler.shared_data_after_reordering_loop.<locals>.<genexpr>C  s      8
 .1HHJJrp  r   r  c                    U S   $ r  r   rr  s    rY   r   =Scheduler.shared_data_after_reordering_loop.<locals>.<lambda>g  s    1r\   r  z?Don't reorder loops since both nodes are reductions: %s v.s. %s)r   r  r  r   buffer_namesr  r_   r/  r  rD   r   r
  r  r@  rn   rT  r~   r'   r  r  dep_size_hintr  r  r  r  rV   r  )rW   r  r  node1_buffer_namesnode2_buffer_namescommon_buffer_namesr  r2  r3  
candidatesbuffer_namer4  r5  _numels                 rY   !shared_data_after_reordering_loop+Scheduler.shared_data_after_reordering_loop8  s>    00C 8
!&8
 5
 5
 "..;;="..;;=0E"383D3D3U3U3WX3WC((C-3WX383D3D3U3U3WX3WC((C-3WX 
.K$1G$1G3356689 !!((2273D3D3FQR2S / z?a $'z~#F '9--Z5S5Sw///
   "g&7&7&99))'22 !!##++G=##%%++G=##Q   ''55e YXs   I??Jc                    [        U[        [        45      =(       a6    UR                  5       (       + =(       a    [	        UR
                  5      (       + $ )z.
Is this node unfusable under any conditions.
)r~   r0  r  r  rA   rM   r  s     rY   r  Scheduler.unfusable_node  sD    
 t79OPQ C$$&&C7		BB	
r\   c                   UR                  5       [        R                  R                  ::  a  gUR	                  5       nUR                  5       nSnXEU-  :  a	  U" S5        g[        S UR                  5        5       5      nU[        R                  R                  R                  R                  4:X  a	  U" S5        gS	S jnU" UR                  5       R                  5      (       a  UR                  5       (       d	  U" S5        gg)
zD
Heuristics to avoid benchmarking predictably slow prologue fusions
Tg?z@prologue fusion will not increase amount of bytes read in kernelFc              3     #    U  HT  nUR                   c  M  UR                   R                  5         H#  nUR                  S:X  d  M  UR                  v   M%     MV     g 7f)Ncall_function)rM   r  rX   r  )r  r  rU  s      rY   r  EScheduler.check_prologue_fusion_heuristics_fusable.<locals>.<genexpr>  sS      
.vv  VV'')tt&	 AHH * .s   A,AAz\prologue fusion will not increase attempt to fuse in padding bc it increases unaligned readsc                F    U R                   S:*  =(       a    U R                  $ )NrK  )itemsizeis_floating_point)rV  s    rY   low_prec_fpGScheduler.check_prologue_fusion_heuristics_fusable.<locals>.low_prec_fp  s    >>Q&B5+B+BBr\   zVprologue fusion that must be upcast to fp32 not profitable for low precision templates)rV  ztorch.dtyper   r   )rr  rD   r   invoke_quant_opsr  r  rm  rq  r   opsatenconstant_pad_nddefaultrl  rV  r  )	rW   prologue_noderx  r  
read_byteswrite_bytesBYTES_THRESHOLD_MULTIPLIERr  rS  s	            rY   (check_prologue_fusion_heuristics_fusable2Scheduler.check_prologue_fusion_heuristics_fusable  s     ,,.!''2J2JJ"88:
#::< &)"'AABRS  
",,.
 
 uyy~~55==??n 	C @@BHHII!>>@@h r\   c                h  ^ XL a  g[        X5      nUR                  5       (       a4  U R                  UR                  5       5      R	                  X5      (       a  g[        U[        5      (       d  [        U[        5      (       a	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        g[        U[        [        45      (       a  UR                  5       (       d	  U" S5        gUR                  5       UR                  -  (       a	  U" S5        gUR                  5       (       Gac  [        R                  (       d	  U" S5        gUR                  5       (       d  UR                  5       (       a	  U" S5        gUR                  5       n[        U[        R                   5      (       d	  U" S	5        gUR#                  5       n[%        S
 UR&                   5       5      U-
  nUR)                  5       U-  (       a	  U" S5        gUR+                  5       (       d  UR+                  5       (       a	  U" S5        gUR-                  5       mTSS  HK  nUR/                  5       nU H2  n	[1        U4S jU	R2                   5       5      (       a  M)  U" S5            g   MM     [        U[4        5      (       d  U/O2UR6                   V
s/ s H  oR                  5       (       d  M  U
PM     sn
n[9        U5      S:X  d   eUS   n[9        TS   R:                  5      S:X  aU  [9        TS   R:                  S   R2                  5      S:X  a,  TS   R:                  S   R2                  S   R<                  UL d	  U" S5        gU R?                  XU5      (       d  gUR                  5       (       aH  UR+                  5       (       d*  UR                  5       (       d  [        R@                  (       d	  U" S5        gUR)                  5       [B        RD                  RF                  -  (       d0  UR)                  5       [B        RD                  RF                  -  (       a	  U" S5        gUR                  5       nUR                  5       nX:w  a
  U" SX5        gAU RI                  X5      nU[        RJ                  :  a&  [        RL                  (       a  U RO                  X5      n[P        RS                  [T        RV                  5      (       a4  [P        RY                  SUR[                  5       UR[                  5       U5        [B        R\                  R_                  XX/5      (       d  gUR                  5       UR                  -  (       a_  U Ra                  X5      =(       aG    [B        R\                  Ra                  XX/5      =(       a     U R                  U5      Ra                  X5      $ [B        R\                  Rc                  XX/5      =(       a     U R                  U5      Rc                  X5      $ s  sn
f )zR
Determine if it is possible to combine node1 and node2 into a
single fused node.
FTz/grouped node must not be fused with other nodesznode1 is extern or nopznode2 is extern or nopznode1 must go before node2zprologue fusion turned offz2prologue fusion only supported for pointwise nodesz2prologue fusion only supported for TritonTemplatesc              3  @   #    U  H  oR                  5       v   M     g 7frU   rj  )r  inps     rY   r  %Scheduler.can_fuse.<locals>.<genexpr>  s     E_c<<>>_rp  z;prologue fusion not implemented for kernel for these inputsz:template prologue can only fuse functional pointwise nodesNr  c              3  @   >#    U  H  oR                   T;   v   M     g 7frU   r  )r  rr   prologue_nodess     rY   r  rc    s     QytyyN:ys   z7template prologue can only fuse nodes with a single user   r   zEtemplate prologue can only fuse nodes with a single use into templateztemplate epilogue not satisfiedz#fusion for buffer explicit disabledzdevice mismatch (%s vs %s)z%s and %s has %s shared data)2r  r  r  r  can_fuse_multi_outputs_templater~   rc  r0  r  rr  r   r   prologue_fusionr  rl  r   r  get_allowed_prologue_inpsr   rM  rx  r   rq  r   r  rR   r3  r  rn   r   rM   r^  r  rD   r   no_fuse_buffer_namesr  score_fusion_memory_thresholdr  rH  r  r  r  r  r  rV   choicesr  can_fuse_verticalcan_fuse_horizontal)rW   r  r  r  rk  r  unsupported_prologue_argsrM   	node_outsr   r  template_snodestemplate_snoder  device2shared_data_scorere  s                   @rY   r  Scheduler.can_fuse  s    >%4#3#3$

)
)%
7$8 e122j'7
 7
 ABu8:PQRR%%''()u8:PQRR%%''()$$&8,-))01!!##u'8'8':':HI779Hh(?(?@@HI$,$F$F$H! EX__EE'( &
 %%'*CCQR--//53Q3Q3S3SPQ"__.N&s+ ,,.	$CQsyyQQQUV$ % , "%);<< !&AAaA 
 '1,,,,Q/N N2&../14r*2215;;<A"2&..q177:??>Q[ @@sSS**,,!!##))12""$qww'C'CC""$qww'C'CC56!!#""$,f> 44UB D DD11 $ F Fu T))'--88##.  !	 yy!!$uHH$$&8 &&u4 MII//UVM$$V,>>uL 9900U M""6*>>uLMC Bs   X/3X/c                   UR                  5       n[        X5      n[        [        5      nUR                   Ht  nU R
                  R                  UR                  UR                  5      n[        U[        5      (       a  U R                  XaU5      (       a  Ma  XW   R                  U5        Mv     UR                  R                   H  n[        U[        5      (       d  M  UR                  U R
                  R                  UR                  UR                  5      5      n	U	(       d  Mb  U	 H,  n
U R                  X5      (       d  M  U	R!                  U
5        M.     M     [#        S [$        R&                  R)                  UR+                  5       5       5       5      nX-  (       a	  U" S5        gUR-                  5       nU HJ  nU R.                  U   R1                  5       nXR2                  U   R4                  -  (       d  MB  U" S5          g   g)z
Check if it is legal to fuse a consumer (node2) into a producer (node1).

We can fuse them if all the reads of node2 either match
corresponding writes in node1, or are written by nodes that can
be scheduled before the fusion of node1 and node2.
c              3  :   #    U  H  nUR                   v   M     g 7frU   r7  r8  s     rY   r  .Scheduler.can_fuse_vertical.<locals>.<genexpr>z  s      $
U HHUr:  zmemory deps did not matchFz(intermediate nodes between node1 & node2T)rx  r  r   r   r   r  r(  r_   r~   r)   fusable_weak_depr  r   r   r'   fusable_read_and_writerQ  r   r;  r<  r"  r   rr  r   rZ   rc  r   )rW   r  r  node1_buf_namesr  remaining_deps_by_namer  r_   cd	remainingr  remaining_depsnode1_op_namesrU  s                 rY   rl  Scheduler.can_fuse_vertical[  s     002%7B47H++C((,,SXXsxx@D#w''D,A,A#e,T,T"(//4	 , ##**Bb),,.22%%))"''277;I y#B222::!((, $ + $ $
 445K5R5R5TU$
 

 +
 +,224"D&&t,==?G 7 7 @ J JJJ>?	 # r\   c                P  ^ UR                   UR                  5       ;  a  gUR                  R                   Vs/ s H!  nUR                   UR                  :X  d  M  UPM#     nn[        U5      S:w  a  gUS   m[        T[        5      (       d   e[        TR                  [        R                  5      (       a  gU R                  UR                     nUR                  R                   Vs/ s H  owR                   U:X  d  M  UPM     nn[        U4S jU 5       5      $ s  snf s  snf )NFr   r   c              3  $  >#    U  H  n[        U[        5      =(       ai    [        UR                  [        R
                  5      (       + =(       a9    UR                  TR                  :H  =(       a    UR                  TR                  :H  v   M     g 7frU   )r~   r'   r   rA  r   TMPrB  )r  r  ri  s     rY   r  -Scheduler.fusable_weak_dep.<locals>.<genexpr>  sn      

 '	 tY' ('

DHH==(

ekk)( 		UZZ'( 's   BB)r_   rx  r   r   r  rn   r~   r'   r   rA  r   r  r*  r   r  )	rW   weak_depr  r  ri  mutating_writes	real_namer  relevant_readss	       `    rY   rx  Scheduler.fusable_weak_dep  s    == 6 6 88 **11
1zzX222 1 	 

 1$"%++++u{{DHH55++H,A,AB	"..44
4T		Y8ND4 	 
  

 '
 
 	
#

s   DD*D#D#c                8   [        U[        5      (       Gab  U R                  R                  UR                  UR                  5      nX2R                  :w  dR  [        UR                  [        R                  5      (       d)  [        UR                  [        R                  5      (       a  g[        R                  (       a:  UR                  UR                  :w  a   UR                  5       nUR                  5       nUR                  UR                  :H  =(       aa    [        UR                  5      [        UR                  5      :  =(       a/    UR                  S [        UR                  5       UR                  :H  $ [        U[        5      (       a  U R                  R                  UR                  UR                  5      nU R                  R                  UR                  UR                  5      nUR                   UR                   :X  a  UR                   b  X4:X  a  ggr   )r~   r'   r  r(  r_   r   rA  r   r  r   r  r  r  rn   rB  r(   rH  )rW   r  ri  	read_name
write_names        rY   ry   Scheduler.fusable_read_and_write  sh   dI&&--11$))TYYGI ZZ'&tzz488<<&u{{DHH==00T]]enn5T ~~') 

ekk) ?		Nc%**o5?II/EJJ0EJJ>
 g&&--11$))TYYGI..225::uzzJJ		UZZ'JJ*+r\   c                    SnXR                   ;  a6   UR                  5       (       d  UR                  5       nX R                   U'   U$ U R                   U   nU$ ! [         a     N-f = fr  )rq  has_unbacked_symbolsnumbytes_hintKeyError)rW   r  ress      rY   rA  Scheduler.dep_size_hint  sy    000//11++-C /2&&s+ 
 ,,S1C
   	s   %A 
A&%A&c                B  ^  [        UR                  R                  5      [        UR                  R                  5      -   n[        UR                  R                  5      [        UR                  R                  5      -   n[	        X45      S-  [        X45      :  a  X4:  a  UnUnUnUR                  R                  UR                  R                  -   Vs/ s H9  nXbR                  R                  ;   d  XbR                  R                  ;   d  M7  UPM;     nn[        U 4S jU 5       5      $ UR                  R                  UR                  R                  -  UR                  R                  UR                  R                  -  -  n[        U 4S jU 5       5      $ s  snf )zV
The first term in our fusion score that estimates number of saved
memory operations.
r  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frU   rA  rM  s     rY   r  0Scheduler.score_fusion_memory.<locals>.<genexpr>  s     ?$3t))#..$r,  c              3  F   >#    U  H  nTR                  U5      v   M     g 7frU   r  rM  s     rY   r  r    s!     I6Hs4%%c**6Hr,  )rn   r   r   r   r)  rT  r  )	rW   r  r  node1_dep_lennode2_dep_lentmpr  rH  common_memory_depss	   `        rY   r  Scheduler.score_fusion_memory  sa    E--334s5;L;L;S;S7TTE--334s5;L;L;S;S7TT },q03}3TT, !,,22U5F5F5M5MMMC++111S<M<M<T<T5T M   ?$???#//558I8I8P8PP##e&7&7&>&>>
 I6HIIIs   6FFc                   [        U5      S:X  a  U$ 0 nU H  u  p4UR                  5       UR                  5       :X  d   eUR                  5       n[        U R                  U5      R	                  X45      5      nXb;  a  X44/X&'   Mo  X&   R                  X445        M     [        UR                  5       [        R                  " S5      S9S   n[        U5      S:  d   eU$ )Nr   r  r   )
rn   r  r   r  get_fusion_pair_priorityr  r)  r  operator
itemgetter)rW   r  "possible_fusions_group_by_priorityr  r  r  fusion_pair_priority&possible_fusions_with_highest_prioritys           rY   r  4Scheduler.get_possible_fusions_with_highest_priority  s    
  A%##  	+ -LE##%)9)9);;;;%%'F#&  (AA%O$  $MNL2H 3HOON - 25.446H<O<OPQ<R2

2. 9:Q>>>55r\   c                D    [         R                  R                  " U /UQ76 $ )z
Shim for list.sort(key=...)
)rD   rk  score_fusionrt  s     rY   r  Scheduler.score_fusion_key%  s     yy%%d3U33r\   c                    [        [        R                  R                  5       5      n[	        U R
                  5       H9  nUR                  XR                  5        UR                  UR                  5        M;     g)zW
Populate node.last_usage recursively (also for the nodes within a FusedSchedulerNode)
N)
r   rD   r   r  r-  ru  r0  r*  r  r   )rW   r.  rM   s      rY   r  Scheduler.compute_last_usage-  sV    
 ))A)A)CDTZZ(D 35L5LM&&t7 )r\   c                   [        U R                  [        R                  R                  -
  [        R                  R
                  R                  -
  5       GHj  nXR                  ;   a[  U R                  U   nUR                  5       (       a5  [        R                  R
                  R                  UR                  5        Ml  Mn  U[        R                  R                  ;   d  M  [        R                  R                  U   n[        U[        R                  5      (       a+  [        R                  R
                  R                  U5        M  UR                  n[        U[        R                   5      (       a  UR#                  5       (       d   e[        R                  R
                  R                  UR                  5        GMm     U R                  R%                  5         g)z*Free any buffers that are no longer neededN)r  r  rD   r   r  r   freedr   r   codegen_freerM   r4  r~   r   r'  r   rh  is_input_bufferclear)rW   r_   r   rb  storages        rY   free_buffersScheduler.free_buffers8  s?   %%gg%%&gg""(()
D
 '''&&t,<<>>GG((55chh? "---gg**40c2#5#566GG((55c:!hhG"7BMM::w?V?V?X?XXGG((55gllC%
( 	!!'')r\   c                    U R                   R                  5        H  nUR                  5         M     U R                  5         g rU   )r  r   flushr  )rW   rA  s     rY   r  Scheduler.flushP  s.    }}++-GMMO .r\   c                   [        U[        5      (       d   e[        S   S==   S-  ss'   [        R                  " [        SS95         UR                  5         UR                  5         S S S 5        UR                  n[        U[        R                  5      (       d   S[        U5      < 35       eUR                  [        R                  R                  5        U R                  5         g ! , (       d  f       N= f)Ninductorextern_callsr   F)increase_kernel_countztype(node)=)r~   r0  r   rD   set_kernel_handlerr$   r  r3  rM   r   rI  rh   r  r   r   r  )rW   scheduler_noderM   s      rY   codegen_extern_callScheduler.codegen_extern_callU  s    .*CDDDD
 	^,1,!!&u"EF002##% G ""$00B[T$ZM2BB0QWW))* GFs   	!C++
C9c                |   [        UR                  5      (       a  UR                  c
   U S35       e[        R                  R                  U5        [        UR                  5      nUc  [        SUR                   35      e[        5       (       d  UR                  S:X  aN  [        R                  R                  U5      =nR                  S:  a  [        U[        R                  " 5       5      e[        UR                  5      (       a.  UR                  S:X  d  [!        [        R                  " 5       5      eU" U 5      $ )Nz( should have been normalized in loweringzUnsupported device type: cuda   mps)r?   rh   rA  rD   r   add_device_infor#   r  r   r   r  get_device_propertiesmajorr*   inspectcurrentframer+   )rW   r  device_schedulingdevice_propss       rY   create_backendScheduler.create_backendd  s    &++&&&,,*B 	
h>?	
B 	
'5fkkB$!:6;;-HII||v%%*ZZ%E%Ef%MM\TTWXX(w7K7K7MNN$$V[[E-A#G$8$8$:;; &&r\   c                    Uc   eXR                   ;  a  U R                  U5      U R                   U'   U R                   U   $ rU   )r  r  r  s     rY   r  Scheduler.get_backendy  s@    !!!&$($7$7$?DMM&!}}V$$r\   c                  ^  SU 4S jjnUR                  5        VVs0 s H?  nUR                  c  M  UR                  R                  5         H  nU" U5      U4S _M     MA     nnn[        UR	                  5       5      nU(       aJ  [        U[        R                  " S5      S9u  pg[        R                  R                  R                  U5        g g s  snnf )Nc                   > U TR                   ;  aM  TR                   R                  [        U R                  R                  5       VV s0 s H  u  pX_M	     sn n5        TR                   W    $ s  sn nf rU   )r  r  rt  r   ru  )r  rq  rW   s     rY   	get_order*Scheduler.enter_context.<locals>.get_order  s^    ,,,$$++i>V,W>VdaQT>V,WX''** -Xs   	A.
r   r  )r  ztorch.fx.Noder   r   )rq  rM   r  r   r  rT  r  r  rD   r   r   enter_context)rW   rM   r  r  rU  r  rx  lasts   `       rY   r  Scheduler.enter_context  s    	+ ^^%
%vv  VV'') q\1t# * % 	 
 w||~&'x':':1'=>GAGG  ..t4 
s
   C1Cc                   ^  U R                   U   R                  n[        U4S jU 5       5      =(       a#    XR                  ;  =(       a    XR
                  ;  $ ! [         a     gf = f)NFc              3  n   >#    U  H*  oR                   =(       d    UR                  5       T;   v   M,     g 7frU   )rc  rV   )r  rr   fused_node_namess     rY   r  AScheduler.can_buffer_be_removed_through_fusion.<locals>.<genexpr>  s)     VPUC3C CCPUr&  )r   rR   r  r  r  r*  )rW   r_   r  rR   s     ` rY   $can_buffer_be_removed_through_fusion.Scheduler.can_buffer_be_removed_through_fusion  sj    	$$T*00E VPUVV 41114333	
  		s   A 
A('A(c                   UR                  5       (       d  gUR                  c  g[        UR                  [        R                  5      (       a  g[        UR                  [        R
                  5      (       a  g[        UR                  SS5      (       a  g[        UR                  S5      (       a6  [        S UR                  R                  R                   5       5      (       a  gg)zBReturn True if we should partition the inductor graph on this nodeTNunbacked_bindingsrj   c              3  |   #    U  H2  n[        U[        R                  5      =(       a    UR                  v   M4     g 7frU   )r~   r.  r  r  )r  exprs     rY   r  -Scheduler.should_partition.<locals>.<genexpr>  s0      0
- tUZZ(>T->->>-s   :<F)r?   rM   r~   r   
DeviceCopyConditionalr   r   r  rj   rB  r  s     rY   should_partitionScheduler.should_partition  s    {{}}99dii//dii004991488499h''C 0
		((--0
 -
 -
 r\   c                    0 nUR                  [        R                  R                  5        U R                   H4  nUR
                  R                  5        H  u  p4UR                  X'   M     M6     U$ )zf
Return a mapping from name strings to the corresponding graph inputs or
base scheduler node outputs.
)r  rD   r   r4  ru  r   r  rM   )rW   r  rM   r_   scheduler_buffers        rY   get_name_to_nodesScheduler.get_name_to_nodes  sd     UWAGG001JJD*.*>*>*D*D*F&%5%:%:" +G  r\   c           	        / n[        [        R                  R                  5       5      nU R	                  5       n[        [        U5      [        U5      5       GHz  u  pg[        5       nU H,  n	UR                  U	R                  R                  5       5        M.     UR                  U5      n
[        R                  R                  U V	s/ s H  oR                  PM     sn	5      n[        UR                  UR                   -   Vs/ s H  oR"                  PM     sn5      U-
  n[        5       nU H  n	UR                  U	R$                  5        M      U Vs0 s H  nX;   d  M
  XU   _M     nnU Vs0 s H  nX;   d  M
  XU;   a  SOS_M     nnU
 Vs/ s H  oU   PM	     nnUR'                  [)        UUUU5      5        UR+                  XJ-
  5      nGM}     USSS2   $ s  sn	f s  snf s  snf s  snf s  snf )z
Gets signature for each graph partition, including input nodes, output nodes, and
whether deallocating an input within graph partition.
TFNr  )r   rD   r   r  r  r!  r-  r  r   r  r  r   rX  rY  r   r   r   r_   r   r  r.   rZ  )rW   
partitionsskip_cudagraphs
signaturesunmet_output_namesr  	partitionskip_cudagraphoutput_namesrM   returned_output_namesr   r  partition_input_namesr  r_   input_nodesinput_deallocationoutput_nodess                      rY   get_graph_partition_signature'Scheduler.get_graph_partition_signature  s    
'(@(@(BC--/),Z (?";*
%I -7LL!##D$8$8$=$=$?@ " %1$=$=>P$Q! '11<<.78id!!i8K K,=,=@R@R,RS,RqFF,RST "
 5?L !$++DOO< "
 21D' )4((1   2"1D' F&::dE1  "
 <QQ;P4.;PLQ' &"	 "7!<!<":"W*
^ $B$E 9 T
"
 Rs*   G
G$
	G)"	G)1	G.>G.G3c                R   / nSn/ n/ nU R                    HW  nU R                  U5      nU(       a)  X&:w  a$  UR                  U5        UR                  U5        / nUnUR                  U5        MY     U(       a"  UR                  U5        UR                  U5        XR                  XS94$ )zz
Given a list of BaseSchedulerNodes, split into a list of
graph partitions and compute partition input/output signatures.
T)r  r  )ru  r  r  r  )rW   r  r  cur_partitionr  rM   r  s          rY   graph_partitionScheduler.graph_partition  s     +-
')JJD#44T:!C!!-0&&~6 "-N  &  m,"">2==! > 
 
 	
r\   c                    [        S5         [        R                  R                  R                  (       a  U R                  5       OU R                  U R                  5       sS S S 5        $ ! , (       d  f       g = f)NzScheduler.codegen)r   r   r   r   r  _codegen_partitions_codegenru  r`   s    rY   r  Scheduler.codegen"  sO    -. ??))99 ((*]]4::. /..s   AA++
A9c                8   [         R                  R                  n[        U R                  5      n[         R                  R                  5          [         R                  R                  SSU 3UUS9  U R                  U5        [         R                  R                  R                  [         R                  R                  5      u  pVSSS5        [         R                  R                  R                  WR                  5        [         R                  R                  R                  XB5        [         R                  R                  R                  R                  UR                   Vs/ s H  owR!                  5       PM     sn5        g! , (       d  f       N= fs  snf )z,Codegen a partition given its inputs/outputsT
partition_)is_subgraphsubgraph_nameparent_wrapper_codepartition_signaturesN)rD   r   r   rs  r  set_current_wrapper_codeinit_wrapper_coder  generateis_inferencedefine_subgraph_launcher_fnvaluecodegen_partition_call	allocatedr  r  rV   )rW   r  	signaturer  graph_partition_idpartition_coderx  rM   s           rY   _codegen_partition_wrapper$Scheduler._codegen_partition_wrapper*  s     gg22!$"?"?@WW--/GG%%  *+=*>?$7%.	 &  MM)$ ! 4 4 = =agg>R>R SN 0 	
889M9MN	334FR	&&--)2)?)?@)?]]_)?@	
 0/ As   A8F%F
Fc                z   U R                  5       u  p[        X5       H\  u  p4[        U5      S:  d   S[        U5       35       eUR                  (       a  U R	                  U5        MK  U R                  X45        M^     [        U R                  5      n[        R                  R                  R                  U5        g)z
Split nodes into partitions and codegen each partition into separate functions.
This allows further applying different optimizations (e.g., cudagraph) to
each function.
r   z5Each partition must have at least one node but found N)r  r!  rn   r  r  r  rs  r  rD   r   r   set_all_partition_names)rW   r  r  r  r  num_partitionss         rY   r  Scheduler._codegen_partitionsD  s     "&!5!5!7
$'
$? Iy>Q& GIGWX& ''i(//	E %@ d;;<	44^Dr\   c                   [         R                  (       a  SS Kn[        R                  " 5       n[        5       n[        U5       H  nUR                  S:X  a0  UR                  UR                  R                  R                  :X  a    OTUR                  UR                  4nXd;  d"   SUR                   SUR                   S35       eUR                  U5        M     S U l        U GHS  n[        R!                  ["        R$                  5      (       a4   [        R'                  SUR)                  5       UR+                  5       5        U R/                  U5        UR1                  5       =n(       Ga  XR                  :w  d*  UR3                  5       (       d  UR5                  5       (       a  U R7                  5         XR                  :w  a  U R                  (       aL  [9        U R                  R:                  5      (       a(  [<        R>                  R@                  RC                  5         Xl        [9        UR:                  5      (       aG  URD                  c   S5       e[<        R>                  R@                  RG                  URD                  5        U RH                  RK                  URL                  5        UR5                  5       (       aN  URO                  [Q        URS                  5       5      5      u  pnU RU                  U5      RW                  XU	5        GO1UR3                  5       (       a-  [X        RZ                  " [\        U5      nU R_                  U5        OURa                  5       (       aw  [X        RZ                  " [b        U5      nU RU                  U5      nS	S
K2J3n  S	SK4J5n  [m        XU45      (       a  UnO[o        S[;        U 5      < 35      eURq                  U5        Oc[m        U[r        [t        45      (       a!  U RU                  U5      Rw                  U5        O'[m        U[x        5      (       d   eUR{                  5         [         R|                  R~                  (       a  U RU                  U5      R                  5         U R                  RK                  UR                  5       5        U R                  RK                  UR                  5       5        [m        U[x        5      (       a  GM  UR1                  5       nUc  GM  U RU                  U5      R                  5       (       d  GMC  U R7                  5         GMV     U R                  (       aL  [9        U R                  R:                  5      (       a(  [<        R>                  R@                  RC                  5         U R7                  5         g ! [,         a(    [        R'                  SUR)                  5       5         GNf = f)Nr   _compile_innerzDuplicate stack frame :zs; did you add a decorator to one of the functions in this stack trace?  If so, try using a context manager instead.z5Generating code for node %s with estimated runtime %fz6Generating code for node %s with estimated runtime 0.0zdevice should have an indexr   )CUDACombinedSchedulingr  ztype(self)=)Fr   "check_stack_no_cycles_TESTING_ONLYtorch._dynamo.convert_frame	tracebackextract_stackr   r-  r_   filename_dynamoconvert_frame__file__linenorE  r  r   r  r  r  r  rV   rc  r   r  r  r  r  r  r9   rh   rD   r   r   codegen_device_guard_exitrA  codegen_device_guard_enterr  r  r   rz  r   rq  r  codegen_templater  r  r0  r  r  r   codegen.cuda_combined_schedulingr  r  r  r~   r  codegen_combo_kernelr3  r~  codegen_noder  r3  r  debug_sync_kernelcodegen_syncrL  rx  r  rr  ready_to_flush)rW   ru  r   stackr  framer  rM   r  rw  rx  ry  backend_r  r  rA  s                   rY   r  Scheduler._codegenY  sF   44.++-E7A|D!% JJ"22%--*E*E*N*NN~~u||4 ,U^^,<Aell^ LJ J
  ) #D..
IIO224 t$**v*111~~''''))JJL000**/@++000 0 ,,FFH*0'(55%||7V9VV7,,GGU%%,,T__=!!484W4W)*51   (99!X !!{{#<dC((.""{{#=tD++F3T8h9O(PQQ&G(KDJ=)9::,,T2D#5}"EFF  (55d;!$(>????}}..  (557''..t/D/D/FG%%,,T-E-E-GHd$:;;*%$*:*:6*B*Q*Q*S*SJJLO R #4T5H5H5M5M#N#N GG  ::<

M ! IIPs   3V**.WWc                    US   R                  5       nU [        R                  l        X l        Uc   eU R                  U5      nUR                  U5      $ )r<  r   )r  rD   r   rL   r  r  benchmark_combo_kernel)rW   rx  r  rA  s       rY   r0   Scheduler.benchmark_combo_kernel  sU     1((* $!!!""6*--i88r\   c                2   [         R                  (       d  gUnUS   R                  5       nUb  UR                  S:X  a  gSSKJn  S/ pe[        U5       H  u  pxUR                  5       n	U R                  U	5      (       a  [        R                  S5         U R                  U	5      u  p[        R                  " U
5      (       a  [        R                  SU5          g	 XZ-  nUR                  U5        M      U R                  U5      u  pnX-
  S:  =(       d    US:  n[        R!                  ["        R$                  5      (       aS  X]:  d  U(       a$  [        R                  S['        X]-  S 5      5        O#[        R                  S[)        X]-  S 5      5        X-
  U:  =(       d    U$ ! U a0  nS
[        U5      ;   a  [        R                  S5         SnA  ge SnAff = f! U a/  nS
[        U5      ;   a  [        R                  S5         SnAge SnAff = f)r|  Tr   Nr  r  g        z<ComboKernel: benchmarking may not accurate due to atomic_addz;ComboKernel benchmark: register spilling of %d-th subkernelFr  zCComboKernel benchmark: return True because of loop-carried variableg333333?z/can fuse (benchmark): fusing causes %sx speedupr  z3cannot fuse (benchmark): fusing causes %sx slowdown)r   r0  r  rh   r  r  rt  rq  ry  r  r  r=  r  r  r   r  r  r  r  r4   r5   )rW   ru  subkernel_nodesr  r  r  
path1_listrq  rn  rx  msr  rU  r  	ms2_clone_path2_listsmall_kernels                    rY   r  !Scheduler.speedup_by_combo_kernel  s   
 ,, #..0 >V[[E1;rZ!/2HA)I ##I..  R55i@::b>>$$U ! " ICd#7 3:
	*.*E*Eo*V'CK ,9c	""7==11yL  E#)C2
   I	#0
 $44M $ *c!f4$$]     	&#a&0  Y 	s=   AF(6G! (G.$GGG!H'$HHHc                r    U R                   U   nUR                  c   eUR                  R                  5       $ rU   )r   rM   
get_layout)rW   r  r   s      rY   get_buffer_layoutScheduler.get_buffer_layout  s5    x(xx###xx""$$r\   c                    U R                    H  nUR                  5       (       d  M  UR                  R                   H  n[        R
                  R                  R                  UR                  5      nU(       d  M?  [        U5      S:X  d  MP  [        UR                  [        5      (       a  Mq  UR                  5       / :X  d  M  [        R
                  R                  R                  UR                  5        M     M     g r  )ru  r?   r   r   rD   r   rF  r(  r_   r-   r~   rj   r0   r   zero_dim_cpu_tensor_listrE  )rW   rM   r  r  s       rY   r  $Scheduler.update_zero_dim_cpu_tensor  s    JJD{{}} ,,22DWW3377		BF+F3u< *6==:K L L"OO-388<<TYYG 3 r\   )__dep_size_hint_cacher  rL  r  r  r  r  r  r*  r  r   r   rc  r  ru  r  r  r  )ru  zlist[ir.Operation]r   r   )r   z!dict[str, SchedulerDonatedBuffer]r  )r  r  r   r   r   )r  r   r   r   )rM   r|  r   rG   r$  )rn  rG   r   r  )r   r%  ru  r/  r   tuple[float, str]ru  r/  rF  r   r   r   )rJ  r   r  r  r   rC  )rx  r/  r   r   )r  rG   r  rG   r   zUnion[bool, Callable[[], bool]])rM   rG   r   rG   rU   )ry  zOptional[int]r   r   r  )ru  r  r   1list[tuple[BaseSchedulerNode, BaseSchedulerNode]]r  rG   r  rG   r   r   )r  rG   r  rG   r0  z"Union[tuple[str], OrderedSet[str]]r   r   r  rG   r  rG   r   r   r  )rZ  rG   rx  rG   r  r  r   r   )r  r)   r  rG   r  rG   r   r   )r  r&   ri  r'   r   r   )r  r&   r   r   )r  rE  r   rE  )ru  z+tuple[BaseSchedulerNode, BaseSchedulerNode]r   r   )r  r0  r   r   )r  r  r   BaseScheduling)r  r  r   rH  )rM   rG   r   r   )r_   r   r  r  r   r   )r   z;dict[str, Union[ir.IRNode, ir.TorchBindObject, sympy.Expr]])r  zlist[PartitionType]r  z
list[bool]r   zlist[GraphPartitionSignature])r   z9tuple[list[PartitionType], list[GraphPartitionSignature]])r  PartitionTyper  r.   r   r   rx  r/  r   z(tuple[float, float, list[Optional[str]]])ru  r  r   r   )r  r   r   z	ir.Layout)Hri   r   r   r   r   r   rs  r  propertyr  setterr  r  r  r  r  r  r  r!  r   r  r  r1  r  r=  rE  rI  r  ry  r  r  r4  r  rd  r  r  r%  r*  r9  rH  r  r^  r  rl  rx  ry  rA  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r0  r  r<  r  r   r  r  s   @rY   rK   rK     s   ))s
j	# & & ( (7#,"HOSb(#T,	 6S(4#&$66	808	8$T0TDHT	T
> 
>*6
>	
>L@\
~(&~(/@~(	(~(@>h,h	 hT..`?0 ,0 	:0 d,&,/@,	,\7&7/@7	7r$&$/@$	$6< < !< =	<
 
<|I6&I6/@I6	I6V
9(9 )9 	9
 
9vQMf3&3/@3	3j

(9
BS
	
J D J&J/@J	J<6 Q6	:6@4@4	4	8*0
'*%5$

+:
	
2	D ; -; @J; 	&; z
	B
<
 
 +
 
	
4E*dL949	19I5V%
H Hr\   c                  F  ^  \ rS rSrSU 4S jjrSS jrSS jr      SS jr      SS jr      SS jr	      SS jr
    SS	 jr        SS
 jr      SS jrSS jrSS jrSS jrSS jr    SS jrS S jr      S!S jr    S"S jrSrU =r$ )#rH  i+  c                .   > [         TU ]  5         Xl        g rU   )r  r   rL   )rW   rL   rQ  s     rY   r   BaseScheduling.__init__,  s    "r\   c                \    U R                   (       a  U R                   R                  5         g g rU   )rL   r  r`   s    rY   free_buffers_in_scheduler(BaseScheduling.free_buffers_in_scheduler0  s    >>NN'') r\   c                    [        5       $ )z0Return a set of .codegen.common.BackendFeature()r   r  s     rY   get_backend_features#BaseScheduling.get_backend_features4  s
    |r\   c                    [         e)z?
Check whether node1 and node2 can be vertically fused or not.
r  r  s      rY   rl   BaseScheduling.can_fuse_vertical8  
     "!r\   c                    [         e)zA
Check whether node1 and node2 can be horizontally fused or not.
r  r  s      rY   rm  "BaseScheduling.can_fuse_horizontal@  rX  r\   c                    g)aE  
A Multi-Output Template (referenced in #144012) is a template node
with MultiOutputLayout, and its output buffers are instances of MultiOutput.
In this context, we verify whether node1 represents the Multi-Output Template
and node2 corresponds to one of its outputs. If so, we further check if
backend supports this fusion.
Fr   r  s      rY   rf  .BaseScheduling.can_fuse_multi_outputs_templateH  s     r\   c                    UR                  5       (       d  UR                  5       (       a  [        R                  X5      $ [        R                  X5      $ )z
Fuse two nodes
)r  r  rj  r3  r  s      rY   rj  BaseScheduling.fuseT  sC     !1!1!3!3-225@@%**588r\   c                    [         e)zK
Process the iteration sizes in case a transformation needs to be applied.
r  )rW   r  s     rY   r  BaseScheduling.group_fn_  rX  r\   c                    [         e)z
Given a template node, generate a kernel.

This function is only available for triton now. If the third-party backend behaves as a sub-class
of TritonScheduling, it can override it or reuse it.
r  )rW   rx  epilogue_nodesre  s       rY   r$  BaseScheduling.codegen_templateg  s
     "!r\   c                    [         ez4
Generate a kernel given a list of pre-fused nodes.
r  )rW   ru  rF  s      rY   rE  .BaseScheduling.generate_kernel_code_from_nodesu  rX  r\   c                    [         ere  r  r  s     rY   r'  BaseScheduling.codegen_node}  
     "!r\   c                    [         e)zd
Generate synchronization code for the kernel. This method depends on the hardware characteristics.
r  r`   s    rY   r)  BaseScheduling.codegen_sync  ri  r\   c                    g)z}
Check whether the backend is requesting the scheduler to flush the generated kernel.
If not supported, please return False.
Fr   r`   s    rY   r*  BaseScheduling.ready_to_flush  s    
 r\   c                    [         e)zM
Flush the generated kernel and python wrapper code to the source code file.
r  r`   s    rY   r  BaseScheduling.flush  ri  r\   c                    [         e)r<  r  rt  s     rY   r=  $BaseScheduling.benchmark_fused_nodes  
     "!r\   c                    [         e)zi
Benchmark a compiled module and return the execution time
in milliseconds on randomly generated inputs.
r  )rW   rJ  s     rY   rI  )BaseScheduling.benchmark_codegened_module  s
    
 "!r\   c                    g)zt
Return an unsigned integer which represents the priority of this fusion pair.
The smaller is with higher priority.
r   r   r  s      rY   r  'BaseScheduling.get_fusion_pair_priority  s     r\   c                    [         e)z
Benchmark the list of nodes to combine and return the execution time
and memory copy time in milliseconds on randomly generated inputs.
r  rw  s     rY   r0  %BaseScheduling.benchmark_combo_kernel  rr  r\   r  )rL   zOptional[Scheduler]r   )r  r  r   zOrderedSet[BackendFeature]rF  r  )r  rP  r   z"tuple[tuple[sympy.Expr, ...], ...])rx  rG   rb  r/  re  r/  r   zOptional[str]rD  )rM   z(Union[FusedSchedulerNode, SchedulerNode]r   r   r   rB  )rJ  r   r   rC  rG  rJ  )ri   r   r   r   r   rQ  rT  rl  rm  rf  rj  r  r$  rE  r'  r)  r*  r  r=  rI  r  r0  r   r  r  s   @rY   rH  rH  +  sH   #*"&"/@"	""&"/@"	"
&
/@
	
	9&	9/@	9		9"3"	+""(" 4" 4	"
 
""0"DH"	"""""0"	""&/@	"4"	1" "r\   rH  )r  r   r   r   )rM   rG   rc  r  r   zdict[str, SchedulerBuffer]r   r   )rU  /Union[FusedSchedulerNode, GroupedSchedulerNode]r   r   )rU  ry  rL   rK   r  r  r   r   )r   )rU  zlist[list[int]]r  rQ  r\  ztuple[int, ...]r   z	list[int])
__future__r   r2  r   rX  r  r;  r  r  r  r  r  r  r  r  r   r   r   r   r   r	   r
   r   r   collections.abcr   typesr   r.  r   torch._inductor.async_compiletorch._dynamo.utilsr   r   torch._inductor.codecacher   r   torch._inductor.metricsr   r   %torch.fx.experimental.symbolic_shapesr   torch.utils._ordered_setr   torch.utils._sympy.symbolr   r   torch.utils._tritonr   r   r   r   r   r   r    analyze_preserves_zero_maskr!   codegen.commonr"   r#   r$   comm_analysisr%   r&   r'   r(   r)   excr*   r+   r,   r-   r.   r/   r0   	loop_bodyr1   r  r2   r3   runtime.runtime_utilsr4   r5   r
  r6   utilsr7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   virtualizedrD   	getLoggerri   r   _logginggetArtifactLoggerr  r  r   rI  	dataclassrI   r   rG   r  rl   r   rb  rV  rW  convolutionmmbmmaddmm
_scaled_mmrJ  r0  r  r~  r[  rd  r3  r  rc  r_  ra  r  r  rK   rH  r   r\   rY   <module>r     s   "         	     , R R R (    $ 6 ? M G / ? * 6 6 D M M ; : : 2    J 7 &     !^^--hA
NN44XO () e. e. e.P 4_ 4 4w
1 w
1t
 
,  &K
&K4&K ,&K 
	&KV #()).."<"<**))..,,!IINN00!&!:!: W 1 W"5. 5~+% ~+B@	$@ $ 
	,P** P*fy:!3 y:x	?, ?J %'+#++ "+ 	+\ 
 
 
> %??, l%H l%H^KK" K"r\   