o
    Zhn                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlmZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlZd dlmZmZ d dlmZ d d	lm Z m!Z! d d
l"m#Z# d dlm$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z= d dl>m?Z? ddl@mAZAmBZBmCZC ddlDmEZEmFZFmGZGmHZHmIZImJZJmKZK ddlLmMZMmNZNmOZOmPZP ddlBmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZemfZfmgZg ddlhmiZi ddljmkZk ddllmmZm ddlnmoZompZpmqZqmrZrmsZsmtZtmuZumvZvmwZwmxZx ddlymzZzm{Z{ erd d l|m}Z}m~Z~mZ d d!lmZ d d"lmZ d d#lmZ d d$lmZ dd%lmZ dd&lmZ d d'lmZ eeZejed(ZejjZe ZeA rd d)lmZ ndMd/d0ZdNd4d5ZdOd8d9ZdPd?d@ZdQdDdEZdRdGdHZG dIdJ dJejjZG dKdL dLeZdS )S    )annotationsN)defaultdict)contextmanager)AnyCallableNoReturnOptionalTYPE_CHECKINGUnion)Expr)deviceTensor)get_decompositions)defakedynamo_timed)FakeScriptObject)
LazyStringtrace_structured)compute_required_storage_lengthmake_channels_last_strides_for)
FakeTensor)BackwardState)magic_methodsmethod_to_operator)free_unbacked_symbolshas_free_symbolsresolve_unbacked_bindingsRuntimeAssertShapeEnvSympyBooleanSymTypes)Node)no_dispatch)
OrderedSet)int_oo   )configirmetrics)BackendFeatureDeviceOpOverridesget_backend_featuresget_device_op_overridesget_wrapper_codegen_for_deviceinit_backend_registrationWorkspaceArg)CppWrapperCodegenErrorLoweringExceptionMissingOperatorWithDecompMissingOperatorWithoutDecomp)ConstantDonatedBufferFixedLayoutget_device_typeGraphPartitionSignatureInputBuffer	Pointwise	Reduction
StorageBox	TensorBoxTorchBindObject)constrain_to_fake_tensorsconstrain_to_fx_stridesFALLBACK_ALLOW_LISTfallback_handler%fallback_node_due_to_unsupported_type	loweringsmake_fallbackmaybe_layout_constraintsneeds_realized_inputsrequire_contiguousunsupported_output_tensor)autotune_cache)AutotuneCacheBundler)SizeVarAllocator)
convert_shape_to_inductorgather_origins get_cloned_parameter_buffer_nameget_donated_idxsget_sympy_Expr_dtypeis_same_tensor#maybe_get_suppress_shape_guards_ctxnormalize_nameshould_assume_input_alignedValueWithLineMap)NullHandlerV)IterableIteratorSequence)
ModuleType)_EffectType)GraphModule)Graph)PythonWrapperCodegen)BaseSchedulerNode)output_code_logZ
perf_hints)log_module_codeargsr   kwargsreturnNonec                  O  s   d S N )rd   re   ri   ri   D/var/www/auris/lib/python3.10/site-packages/torch/_inductor/graph.pyrc         rc   constant_buffer
sympy.ExprOptional[torch.dtype]c                 C  sh   t | tjtjtjjjfsJ dt | tjjjrtjS t | tjr&t	| S | j
r,tjS | jr2tjS d S )Nzgget_constant_buffer_dtype only supports input of sympy.Symbol, sympy.Expr or sympy.core.numbers.Integer)
isinstancesympySymbolr   corenumbersIntegertorchZint64rQ   
is_integerZis_floatZfloat32)rl   ri   ri   rj   may_get_constant_buffer_dtype   s   rw   opboolc                 C  s   t dd tD }| |v S )Nc                 s  s    | ]}t |V  qd S rh   )r   ).0mri   ri   rj   	<genexpr>   s    z"is_magic_method.<locals>.<genexpr>)r#   r   )rx   Z	magic_opsri   ri   rj   is_magic_method   s   r}   objr^   targetstr1Union[Tensor, torch._C.ScriptObject, GraphModule]c                 C  sT   | d}| }t|D ]\}}t||s"tdd|d |  t||}q|S )N.z#Node referenced nonexistent target )split	enumeratehasattrRuntimeErrorjoingetattr)r~   r   Ztarget_atomsZattr_itriZatomri   ri   rj   getattr_recursive   s   

r   gr_   dict[Node, tuple[int, ...]]c                 C  s^   i }| j ddd }d|jvr|S t|jd D ]\}}||jd v r,|jd | ||< q|S )Noutputrx   r   Zuser_visible_output_idxsZoriginal_output_strides)Z
find_nodesmetar   rd   )r   retZoutput_nodeidxnoderi   ri   rj   get_user_visible_output_strides   s   
r   user_visible_output_stridesc           	      C  s  t jsdS ttjtjtjg}ttjtjtj	tj
tjtjtjtjtjtjtjtjg}ddd}t| jD ]J}t|jtjjjrFd|jd	< q5||}|sMq5||v rVd|jd	< |jd	d
rs|jD ]}||}|siq`||vrrd|jd	< q`t js||v rd|jd	< q5dS )a  
    Nodes like convolution/convolution_backward want its input to be dense.
    If we pad their inputs, we result in extra calls to copy kernels!  On the other hand, padding usually helps reduction.

    The pass finds nodes that dislike padding. These are nodes that can be reached
    from a convolution/convolution_backward in the backward direction without
    going thru a reduction.
    Nr   torch.fx.Noderf   %Optional[torch._ops.OpOverloadPacket]c                 S  s2   | j dkrt| jtjjrt| jdr| jjS d S )Ncall_function_overloadpacket)rx   ro   r   ru   _ops
OpOverloadr   r   )r   ri   ri   rj   _get_overload_packet   s   

z8mark_nodes_dislike_padding.<locals>._get_overload_packetTZdislike_paddingF)r   r   rf   r   )r&   Zcomprehensive_paddingr#   atenconvolutionconvolution_backwardZ
_scaled_mmZvar_meansummeanprodanyZaminZamaxminmaxZargminZargmaxZscatter_reducereversednodesro   r   ru   _higher_order_opstriton_kernel_wrapZTritonKernelWrapperMutationr   getZall_input_nodespad_outputs)	r   r   Zops_dislike_paddingZops_like_paddingr   currx   priorZprior_opri   ri   rj   mark_nodes_dislike_padding   s`   





r   c                      s  e Zd ZU ded< 																dd fd#d$Zdd%d&Zdd*d+Zdd-d.Zdd2d3Zdd6d7Z	dd<d=Z
dd?d@ZejddBdCZddEdFZeddHdIZddJdKZddOdPZddRdSZddTdUZddVdWZeddYdZZdd]d^ZddadbZddddeZddgdhZddjdkZd fdndoZddrdsZddtddxdyZ dd|d}Z!dddZ"dddZ#dddZ$dddZ%	ddddZ&dddZ'd  fddZ(d fddZ)edddZ*dddZ+dddZ,dddZ-d fddZ.dddZ/edddZ0edddZ1dddZ2d	 fddZ3dddZ4				d
dddƄZ5dddɄZ6ddd˄Z7ddd̈́Z8dddЄZ9dddӄZ:dZ;ded< ddd؄Z<dddڄZ=ddd܄Z>dddބZ?  Z@S (  GraphLoweringzlist[ir.IRNode]graph_outputsNFgmtorch.fx.GraphModuleexample_inputsOptional[Sequence[object]]	shape_envOptional[ShapeEnv]graph_idOptional[int]cpp_wrapperry   aot_mode
layout_optOptional[bool]extern_node_serializer4Optional[Callable[[list[ir.ExternKernelNode]], Any]]is_inferenceis_backwardis_const_graphconst_output_indexOptional[dict[str, int]]const_wrapper_codeOptional[str]const_kernel_codeconst_moduleOptional[GraphLowering]nameinputs_to_checkOptional[Sequence[int]]rf   rg   c                   s6  t  | || _|d ur|n| j||	d| _d| _|	| _|
| _|| _|| _	|| _
|| _|| _d| _|d u r=t }d| _nd| _|| _|j | _ttj  | _t|| _g | _i | _i | _tt  | _|ri|jnt | _|rr|j nt | _ d| _!i | _"g | _#g | _$|r|ni | _%|rt|& nt | _'|r|j(ni | _(i | _)i | _*i | _+tt  | _,tt  | _-tt  | _.tt  | _/tt  | _0tt  | _1d | _2d | _3g | _4ddl5m6} t78 r|r|n|| _9d | _:i | _;tt  | _<g | _=i | _>t?t@| _Ai | _BtCC | _D|| _E|| _F|| _Gi | _H|| _I|| _JtKtL| _Md | _Nd | _O| jr(| P nt | _Qtdg| _RtS|jT| _UtV|jT| jU d| _Wd| _Xg | _Yd | _Zi | _[|\ | _]| j^j_`d	i | _a|d urd|jbni | _btc  tded tf| _fi | _gt | _htt  | _it | _jt | _ktt  | _ltmn | _od
| _ptq | _rd S )N)r   r   FTcpu)extern_node_json_serializerzaten.convolution_backward  dynamo_flat_name_to_original_fqn)ssuper__init__r   decide_layout_optr   num_channels_last_convr   r   r   r   r   r   r   Zextra_tracebackr   reuse_shape_env
_shape_envZdeferred_runtime_assertscopyras_by_symbolr#   rp   rq   bound_unbacked_symbolsrL   sizevarsgraph_input_namesgraph_inputsgraph_inputs_originalr   zero_dim_cpu_tensor_listdevice_typesdevice_idxsdevice_typebuffer_to_padded_sizebuffers
operationsr   keysZfolded_constants	constantstorchbind_constantsseen_subgraphsconstant_reprsremoved_operationsremoved_buffersZremoved_inplace_buffersmutated_buffersZnever_reuse_buffersinplaced_to_remove
device_opswrapper_codeZextern_kernel_nodesZ&torch._inductor.extern_node_serializerr   r&   	is_fbcoder   current_nodelistsmutated_inputsmutated_input_idxsname_to_bufferr   listname_to_users
name_to_optimeZcreation_timer   r   Zrecord_multi_kernel_choiceZmulti_kernel_to_choicer   r   next_post_grad_graph_counterZpost_grad_graph_id	schedulercurrent_devicefind_nodes_prefer_channels_lastnodes_prefer_channels_last_warned_fallbackr   graphr   r   	cache_key
cache_pathcache_linemapZdisable_cudagraphs_reasondevice_node_mapping__copy__orig_gmmoduler   r   r   allocated_constant_namer.   	functools	lru_cacher+   Zeffectful_opsaligned_inputsZno_fuse_buffer_namesZlow_precision_codegen_opsZinvoke_quant_opsall_codegen_kernel_names	itertoolscountZworkspace_idplaceholder_idxrP   bw_donated_idxs)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   	__class__ri   rj   r     s   






zGraphLowering.__init__c                 C  s   | j   d S rh   )r   freeze_runtime_assertsr  ri   ri   rj   r    s   z$GraphLowering.freeze_runtime_assertsextorch.Tensor=tuple[Sequence[Union[int, Expr]], Sequence[Union[int, Expr]]]c           	      C  sx   | j rt| t| fS ddlm} |dt| jj }| j	||\}}}dd |D }dd |D }||fS )z
        Support dynamic shapes and dynamic strides by assigning variables
        to each dimension.  We duck-shape tensors, so if two tensors
        have the same size they get assigned the same symbolic variable.
        r   )ConstantSourceZ__inductor_unknown_tensor_c                 S  $   g | ]}t |tjr|jjn|qS ri   ro   ru   SymIntr   exprrz   r   ri   ri   rj   
<listcomp>     $ z8GraphLowering.symbolic_sizes_strides.<locals>.<listcomp>c                 S  r  ri   r  r  ri   ri   rj   r    r   )
r   rM   sizestrideZtorch._dynamo.sourcer  lenr   Z
var_to_valZ,create_symbolic_sizes_strides_storage_offset)	r  r  r  sourcer!  r"  _Zr_sizeZr_strideri   ri   rj   symbolic_sizes_strides  s&   z$GraphLowering.symbolic_sizes_strides)tuple[list[sympy.Expr], list[sympy.Expr]]c                 C  s,   dd |  D }dd | D }||fS )z+
        Primarily used to weights
        c                 S     g | ]}t |qS ri   rp   rt   r  ri   ri   rj   r        z6GraphLowering.static_sizes_strides.<locals>.<listcomp>c                 S  r(  ri   r)  r  ri   ri   rj   r    r*  )r!  r"  )r  r  r!  r"  ri   ri   rj   static_sizes_strides  s   z"GraphLowering.static_sizes_stridesr   OUnion[ir.TensorBox, ir.StorageBox, ir.Buffer, WorkspaceArg, ir.TorchBindObject]Sequence[Expr]c                 C  sP   t |tjr	|j}t |tjr|j}t |tjr$|j| jv r$| j|j S | S rh   )	ro   r'   r=   datar<   ComputedBufferr   r   get_size)r  r   ri   ri   rj   get_allocation_size  s   
z!GraphLowering.get_allocation_size2Union[ir.Buffer, WorkspaceArg, ir.TorchBindObject]r   c                 C  s*   |  }| |}|j}|j}t|||S rh   )Z
get_layoutr1  r"  offsetr   )r  r   layoutr!  r"  r3  ri   ri   rj   get_allocation_storage_size  s
   
z)GraphLowering.get_allocation_storage_sizer   .Union[torch._inductor.ir.IRNode, device, None]featurer)   c                 C  s$   t |ts	J ||| t|v S rh   )ro   r)   r+   r7   )r  r   r7  ri   ri   rj   has_feature  s   zGraphLowering.has_featuretorch.devicec                 C  s   | j  }r|S td)NzNo current device)r   r   r  r   ri   ri   rj   get_current_device_or_throw  s   
z)GraphLowering.get_current_device_or_throwIterator[None]c                 c  s*    | j }|| _ z	d V  W || _ d S || _ w rh   )r   )r  r   r   ri   ri   rj   set_current_device  s   z GraphLowering.set_current_devicer   c                 C  s   | j rdS | jr
dS dS )NZ	inferenceZbackwardforward)r   r   r  ri   ri   rj   get_training_phase  s
   z GraphLowering.get_training_phaser^   c             
   C  s  t jsdS t jr
dS dd | jjD }t|}|dkrdS tjjj	r3tjj
 r3tdd |D r3dS tt| jjd| krFtd	 dS td
d |D rVtd dS d'dd}d(dd}d(dd}|rddlm} tt}|D ]g}	tjj|	\}
}}|
r|dd#}tj |	j|i | W d   n1 sw   Y  W d   n1 sw   Y  | }||	rd}n||	rd}n	||	rd}nd}||  |7  < qttd qtd}d }d!}d"}t| }|d | |d |  |d |  |d |  }||k}|std#|| |S tt||r!td$ dS tt||r0td% dS tt||r?td& dS dS ))zl
        Decide if we should enable layout optimization for this graph based on
        heuristics.
        FTc                 S  s"   g | ]}|j tjjjjkr|qS ri   )r   ru   opsr   r   default)rz   nri   ri   rj   r  1  s    z3GraphLowering.decide_layout_opt.<locals>.<listcomp>r   c                 s  s6    | ]}d D ]}|j | jd jtdkV  qqdS )r   r%   valr   N)rd   r   r   ru   rz   rB  r   ri   ri   rj   r|   =  s    z2GraphLowering.decide_layout_opt.<locals>.<genexpr>i,  z*Skipped layout opt because only a few convc                 s  s.    | ]}d D ]}t |j| jd V  qqdS )rC  rD  N)r   rd   r   rE  ri   ri   rj   r|   L  s    zeSee perf regression with dynamic shape. Follow up in https://github.com/pytorch/pytorch/issues/102670rB  r   rf   ry   c                 S  s<   | j d jd }t|tjsJ | j d dko|ddkS )Nr%   rD  r   )rd   r   ro   ru   r   r!  )rB  Zmeta_valri   ri   rj   
is_groupedV  s   z3GraphLowering.decide_layout_opt.<locals>.is_groupedr   c                 S  sJ   | j d jd dd | j d jd dko$| j d jd ddkS )Nr%   rD  r      rd   r   r!  rB  ri   ri   rj   is_in_out_channel[  s   0z:GraphLowering.decide_layout_opt.<locals>.is_in_out_channelc                 S  s4   | j d jd ddko| j d jd ddkS )Nr%   rD  r   @   rH  rI  ri   ri   rj   is_small_channela  s   z9GraphLowering.decide_layout_opt.<locals>.is_small_channel)FlopCounterMode)displayNgroupedZsmallZin_outrA  zConv inputs meta not foundg|?5^?gtV?g333333?guV?zhSkipped layout opt in inference because weighted flops indicate slowdown, default: %d, channels last: %dzFSkip layout opt because found grouped convolution with >1 in_channels!zBSkip layout opt because some convolutions have smaller out_channelz>Skip layout opt because all convolution channels are too small)rB  r   rf   ry   )rB  r   rf   ry   )r&   Zlayout_optimizationZforce_layout_optimizationr   r   r#  ru   backendsmkldnnenabledZis_availableallr   logdebugr   Ztorch.utils.flop_counterrM  r   float	_inductorZfx_utilsZget_fake_args_kwargsrX   	fake_moder   Zget_total_flopsr   valuesmap)r   r   Z
conv_nodesZnconvrF  rJ  rL  rM  Zflop_countsr   successrd   re   Zflop_counter_modeZcounted_flopsZ	node_typeZGROUPED_MULTIPLIERZDEFAULT_MULTIPLIERZIN_OUT_MULTIPLIERZSMALL_MULTIPLIERZtotal_flopsZweighted_flopsZdo_layout_optri   ri   rj   r   %  s   
	








	
zGraphLowering.decide_layout_optc                 C  s   | j dur| j  d| S |S )z2Prepend the given name with the graph name if any.Nr%  r   r  r   ri   ri   rj   qualify_name  s   
zGraphLowering.qualify_namelist[torch.Tensor]subgraph_nameSubgraphLoweringc                 C  s.   t | ||| j| j| j| j| j| j| |d
S )a  
        Make a subgraph of the current graph with all inherited parts, except
        the graph module (`gm`) and `example_inputs`.  The subgraphs are lowered
        separately and lifted into a separate function in the parent output
        wrapper code.  The subgraph name is qualified by the parent graph's
        name. Note that the lifting of subgraph is supported for python wrapper
        only. For cpp wrapper, we inline the subgraphs in the parent wrapper.
        )
parentr   r   r   r   r   r   r   r   r   )ra  r   r   r   r   r   r   r^  )r  r   r   r`  ri   ri   rj   make_subgraph  s   zGraphLowering.make_subgraphOrderedSet[Node]c                 C  s   t t  }t| jjjD ]"}|jtjj	j
jkr|| q|jD ]}||v r-||  nq q| jjjD ]}||v r@||j q4|S )aC  
        The rule to decide if an node prefer channels last is simple.
        1. if it's input/output of a convolution
        2. if one of its user prefers channels last

        We have rule 1 because cudnn runs a faster convolution kernel for channels last inputs;
        Rule 2 is also important. It makes sure that indirect inputs to convolution also prefers
        channels last.

        Consider the scenario: conv -> batch-norm -> relu -> conv
        Without rule 2, batch-norm output may use a contiguous layout. That will cause 2 extra copies:
        1. the output of batch-norm should be channels last initially since its input is a conv's output.
           Forcing the batch-norm's output to be contiguous results in the first copy
        2. The second conv's input is initially contiguous. This layout is propagated from the batch-norm's output.
           We need convert it to channels last layout which results in the second copy.
        With rule 2, we makes sure all the tensors in the chain uses channels last layout. So both copies
        can be saved.
        )r#   r!   r   r  r   r   r   ru   r@  r   r   rA  addusersupdate)r  Z
output_setrB  userri   ri   rj   r     s    



z-GraphLowering.find_nodes_prefer_channels_lastc                 C  s*   || j vr| j | td| d S d S )NzUsing FallbackKernel: %s)r   re  perf_hint_loginfor]  ri   ri   rj   warn_fallback  s   
zGraphLowering.warn_fallbackc                 C  sR   | j |j |jd ur| j|j tjjr%|| jvr'tjj| j|< d S d S d S rh   )	r   re  typeindexr   rX   r   r   r  r:  ri   ri   rj   add_device_info!  s   
zGraphLowering.add_device_info,torch._subclasses.fake_tensor.FakeTensorModec                 C  s   t jS rh   )rX   rX  r  ri   ri   rj   rX  (  s   zGraphLowering.fake_modebuffer_name<Optional[Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]]c                 C  sl   || j v r
| j | S || jv r| j| S || jv r4tjj| }tj|tj|j|j	gtj
|R  dS d S Nr   r4  )r   r   r   rX   r   r'   ConstantBufferr6   r   dtyper+  )r  rp  r.  ri   ri   rj   try_get_buffer,  s   





zGraphLowering.try_get_buffersymbolrm   c                 C  s   t d)Nz'Should not be called for the main graph)r   )r  rw  ri   ri   rj   add_symbol_graph_input>  s   z$GraphLowering.add_symbol_graph_input2Union[ir.TensorBox, ir.Buffer, ir.TorchBindObject]c                 C  s$   |  |}|d ur|S td| )Nz$Failed to find buffer matching name )rv  r   r  rp  bufri   ri   rj   
get_bufferA  s   
zGraphLowering.get_buffertorch.dtypec                 C  s   || j v r| j | jS t| jdr5|| jjv r5| jj| }|| jv r)| j|  S || jv r5| j|  S || jv rA| j|  S || jv rM| j|  S t	d|}|r]| |
dS td| )Nmutation_real_namez1(as_strided|reinterpret_tensor)\(([a-zA-Z0-9_]+),r%   could not find )r   ru  r   r   r~  r   	get_dtyper   rematchgroupKeyError)r  rp  Zmutated_bufr{   ri   ri   rj   r  I  s$   





zGraphLowering.get_dtypeUnion[int, Expr]c                 C  sf   || j v r| j |  S || jv r | j| }| sdS | S || jv r,| j|  S td| )Nr%   r  )r   Znumelr   Zhas_tensor_output	get_numelr   r  rz  ri   ri   rj   r  _  s   



zGraphLowering.get_numelrd   r   c                   s8   t d t j| W  d    S 1 sw   Y  d S )NGraphLowering.run)r   r   run)r  rd   r  ri   rj   r  k  s   

$r  rx   ir.Operationc                 C  s^   |j d u sJ d| t|tjsJ | dt| j }| j| || j|< ||_ |S )NzOperation registered twice: rx   )	Zoperation_namero   r'   Z	Operationr^  r#  r   appendr   )r  rx   r   ri   ri   rj   register_operationo  s   
z GraphLowering.register_operationset_namebuffer	ir.Bufferr  c                C  sv   |  dt| j }| j| || j|< | }|d ur4t|tjr/|	 r/|t
dks4| | |r9||_|S )Nr{  r   )r^  r#  r   r  r   Z
get_devicero   r'   r/  Zis_zero_elementsru   r   rn  r   )r  r  r  r   r   ri   ri   rj   register_bufferx  s   


zGraphLowering.register_bufferoperation_names	list[str]c                 C  s"   |  dd| }|| j|< |S )Nlist_r%  )r^  r   r   )r  r  r   ri   ri   rj   register_operation_list  s   
z%GraphLowering.register_operation_listnode_output%Union[Iterable[ir.IRNode], ir.IRNode]c                   s   d fdd  | d S )Nvaluer  rf   rg   c                   sR   t | ttfr| D ]} | q	t | tjr%|  D ]}j| |  qd S d S rh   )ro   r   tupler'   r=   Zget_read_namesr   r  )r  xZ	read_nameregisterr  ri   rj   r    s   
z1GraphLowering.register_users_of.<locals>.register)r  r  rf   rg   ri   )r  r  ri   r  rj   register_users_of  s   zGraphLowering.register_users_ofc                 C  sD   t |tsJ | j| || jvrdS | j| D ]}|  qdS )z
        When a buffer is mutated we need to make sure all the reads to
        the old version are realized before the mutation happens.
        N)ro   r   r   re  r   realize)r  r   rh  ri   ri   rj   mark_buffer_mutated  s   

z!GraphLowering.mark_buffer_mutatedc                 C  sP   || j v r
|| jv sJ d| t| j | }|| jjv r#| jj| S | j| S )z
        In AOTI, module buffers may have been mutated during the tracing and compilation.
        Thus we need to read from previously stored original buffers, to make sure the
        generated model.so uses correct initial values.
        z$Can not find the original value for )r  r   rO   r  r   )r  r   	orig_nameri   ri   rj   get_original_value_of_constant  s   
z,GraphLowering.get_original_value_of_constantr.  Union[Tensor]c              
   C  s   t jjs| j D ]\}}t||r|  S q	|d u r#dt| j }|}|d  r0d| }| |}t	|}|}d}|| jv rR| d| }|d7 }|| jv sB|| j|< |j
d|jdt| dt| dt|d	| j|< || j|< |S )NZconstantr   Z	constant_r%  r%    r  )r&   aot_inductoruse_runtime_constant_foldingr   itemsrR   r#  isdigitr^  rT   r   ru  r  r!  r"  hashr   r  )r  r   r.  constant_namer  r  prefixZcntri   ri   rj   allocate_non_dup_const_name  s:   








z)GraphLowering.allocate_non_dup_const_namer   r=   c              	   C  s8   |  ||}ttj|t|j|jg| |R  dS rr  )	r  r=   creater'   rt  r6   r   ru  r+  )r  r.  r   new_nameri   ri   rj   add_tensor_constant  s   z!GraphLowering.add_tensor_constantdevice_overrideOptional[torch.device]c                 C  sz   | j | j|ks|du r|S tjj  | | d|j |jp!d | j | 	|W  d   S 1 s6w   Y  dS )z
        We AOT copy constants to the devices they are needed on.
        If device_override doesn't match the constant's device, then
        copy it and return a different name.
        Nr%  r   )
r   r   ru   utils_python_dispatch_disable_current_modesr  rl  rm  to)r  r   r  ri   ri   rj   r    s   $zGraphLowering.constant_namer   tuple[object]re   dict[str, object]Union[Expr, TensorBox, None]c              	     st  |  j d7  _ t |||}| |}t|tr*|jj}|| j|< | j	
| |S t|tttfrDt|}|| j|< | j	
| |S t|tr\t||d}|| j|< | j	
| |S |d u rh| j	
| d S t|tru| j	
| d S t|tjrttjjjdkrtttjjjjtjjju sJ t j!||j"d}|| j|< | j	
| |S t|tj#sJ ||j$s| %|\}}	n| &|\}}	| j'r| j(r| j | j(v rt)*t+|t,|j"|j-||	d}
nt)*t.|t,|j"|j-||	d}
|
| j|< | j	
| |
j/j/| j0|< | jjr| 1|j" t2  t3|r'| j45| W d    |
S W d    |
S 1 s3w   Y  |
S )Nr%   r   r  )r   r   rs  )6r  r   placeholderr^  ro   r    r   r  r   r   r  intry   rV  rp   Zsympifyr   r>   r   ru   	Generatorr#  rX   r   r   rf  r   iterr   Z_primsZ	rng_primsZgraphsafe_run_with_rng_stater'   GeneratorStater   r   Z_has_symbolic_sizes_stridesr+  r&  r   r  r=   r  r5   r6   ru  r9   r.  r   rn  rS   rU   r  re  )r  r   rd   re   Zexampler  r~   gensizesstridestensorr  ri   rj   r    s   













zGraphLowering.placeholderr   dict[str, Any]c              
     s  |t ju rt|d tttfrt |||S t|tj	j
s*t|dr*||i |S |tvrt|tj	js<J | d| dd }|tv rQt|ddd nTtjrt|gr[tnt}td|||| tjjj|jv rst}ntjjj|jv r~d }ntj j!"|r| j#rt$}nd }t||d	 nt|grt|||t|||z:t%d
t|  | j&}t'|}|r||}	}
||g|R i |\}}t| |i |}|r| (||	|
|| |W S  t)y } zt*||||+|j,d d }~ww )Nr   Z_inductor_lowering_functionz is not an OpOverloadr   FT)warnZoverride_decompz"Creating implicit fallback for:
%s)Zlayout_constraintz  via %s)-operatorgetitemro   r   r  dictr   r   ru   r   ZOpOverloadPacketr   rD   r   r   r   rA   rE   r&   Zimplicit_fallbacksr   r2   r3   rT  rj  Zoperator_str_CTagneeds_fixed_stride_ordertagsr@   flexible_layoutZ_libraryr  
is_builtinr   rH   rU  r   rF   propagate_mutation	Exceptionr1   with_traceback__traceback__)r  r   rd   re   	base_nameerrorZdecided_constraintrB  Zlayout_constraintsold_args
old_kwargsouter  ri   rj   r   Z  sh   


zGraphLowering.call_functiontc                 C  s   t | jdko| jd dkS )zM
        True if this is a small constant attr that will be inlined.
        r%   r      )r#  shape)r  ri   ri   rj   can_inline_constant  s   z!GraphLowering.can_inline_constant	tuple[()]8Union[Constant, TensorBox, ir.Subgraph, TorchBindObject]c                 C  s  t | j|}t|tjjr%|| jv r| j| S tj||d}|| j|< |S t|tj	j
r<|| j|< d| j|< t||dS t|trS|j| j|< d| j|< t||jdS t|tjs[J tjjsftjsft|rl| ||S t D |jdkrt| |j|jdW  d    S | |rtdt | ddl!m"} ||# |j|jd	W  d    S W d    n1 sw   Y  | ||S )
N)r   Zgraph_moduler   r  ri   )r  ru  r   zInlining constant: %s r%   )r  )ru  r   )$r   r  ro   ru   fxr^   r   r'   ZSubgraphr  ZScriptObjectr   r   r>   r   Zreal_objr   r&   r  r  Zalways_keep_tensor_constantsrI   r  r"   r  r4   itemru  r   r  rT  rU  r   loweringr  tolist)r  r   rd   re   r  r  r  ri   ri   rj   get_attr  sJ   








zGraphLowering.get_attrr   c                 C     t rh   AssertionErrorr  r   rd   re   ri   ri   rj   call_module  rk   zGraphLowering.call_modulec                 C  r  rh   r  r  ri   ri   rj   call_method  rk   zGraphLowering.call_methodc              	     sZ  t  |||}t|ttfs|f}t|ttfsJ t|tdd |D s,J |tjj	j
d }t|ttfs=|f}dd |D }g }t|t|ksPJ t||D ]C\}}t|tjtjfsh|| qUt| tjrz|tj| qUtjj|sJ dd |jd  D }	|t||	 qU|| _| j D ]q\}
}t|trqt|ttj tjjj!fsJ dt| t|tsq|"  t|tsJ |j#}t|tj$sJ |}|j#}t|t%r|& |
krtj'(|| j)|
  z| j*|}| j)|
 | j|< W q t+y   Y qw q| ,  t-.d	| j/| j0d ur(| j0 d S d
 d S )Nc                 s  s@    | ]}t |ttjtd tjtjtjj	j
ttjtjf	V  qd S rh   )ro   r=   r'   r4   rl  rt  rp   r   ZlogicZboolalgBooleanr  ZEffectfulKernelShapeAsConstantBufferrz   r  ri   ri   rj   r|     s"    
z'GraphLowering.output.<locals>.<genexpr>r   c                 S  s   g | ]}t j|qS ri   )r'   ExternKernelZrealize_inputr  ri   ri   rj   r    s    z(GraphLowering.output.<locals>.<listcomp>c                 S  r  ri   r  rz   sri   ri   rj   r  "      rD  z'Unsupported inductor graph input type: zGForce channels last inputs for %d conv for the current graph with id %dr   )1r   r   ro   r  r   rl  rS  rX   r   r   rd   r#  zipr'   r=   BaseViewr  Zget_output_specZCommBufferLayoutr  Z
copy_inputru   rW  Zis_storage_and_layoutr   r"  Ztry_match_insignificant_stridesr   r   r  r>   rp   r   r  r  r.  r<   r9   get_nameZMutationLayoutSHOULDREMOVEZrealize_intor   rm  
ValueErrorfinalizerT  rU  r   r   )r  r   rd   re   resultZfx_node_argsZresult_correct_stridesrfx_nodeZmeta_stridesr   r  Zvalue_storage_boxindr  ri   rj   r     s~   




zGraphLowering.outputc                 C  s   | j D ]}|  qd S rh   )r   Zdecide_layout)r  r{  ri   ri   rj   r  M  s   

zGraphLowering.finalizer   c                 c  s*    | j }z|| _ d V  W || _ d S || _ w rh   )r   )r  r   oldri   ri   rj   set_current_nodeQ  s   zGraphLowering.set_current_nodec                 c  s$    | j }z	d V  W || _ d S || _ w rh   r   )r  r  ri   ri   rj   set_current_wrapper_codeZ  s
   z&GraphLowering.set_current_wrapper_coder  r  
tuple[Any]r  new_args
new_kwargsc                   sd  t |t |ks
J t |t |ksJ |jtjjju ra|jd }t|ts(J tj	j
|d |d dd | D }|D ] }|d | }	|d | }
|	|
u rQq> tjjjj|	|
fi  q>dS t|jtjjskJ d fdd}|jj}tt||D ]\}\}	}
|j| }|||	|
 q}dd |jD }| D ]}|| }	|| }
|| }|||	|
 qdS )ax  Propagate mutations on new_args/new_kwargs back to old_args/old_kwargs.

        Assumes we may have cloned old_args/old_kwargs into new_args/new_kwargs
        and then called fx_node(*new_args, **new_kwargs).

        If fx_node mutates any of new_args/new_kwargs, and they are different from
        old_args/old_kwargs, then we need to update the original tensor.
        re   Z
kernel_idxZconstant_args_idxc                 S  s.   i | ]\}}|t |tjjr|jd  n|qS )rD  )ro   ru   r  r!   r   )rz   kvri   ri   rj   
<dictcomp>{  s    z4GraphLowering.propagate_mutation.<locals>.<dictcomp>N
schema_argtorch._C.Argumentold_arg	ir.IRNodenew_argrf   rg   c                   sz   ||u rd S | j d ur7| j jr9t|tjr|f}|f}t||D ]\}}||u r)q  tjj	j
j||fi  q d S d S d S rh   )Z
alias_infoZis_writero   r'   IRNoder  r   ru   r@  r   copy_rA  )r  r  r  Zold_arg_itemZnew_arg_itemr  ri   rj   maybe_propagate  s   z9GraphLowering.propagate_mutation.<locals>.maybe_propagatec                 S  s   i | ]}|j |qS ri   r\  )rz   argri   ri   rj   r     s    )r  r  r  r  r  r  rf   rg   )r#  r   ru   r@  higher_ordertriton_kernel_wrapper_mutationre   ro   r  r   r   Zget_mutated_tensorsr  r   r   r  rA  r   r   Z_schemar   r  	argumentsr   )r  r  r  r  r  r  re   Zmutatedr   r  r  r  Zschemar   r  Zschema_kwargskeyri   r  rj   r  b  s@   

z GraphLowering.propagate_mutationrB  objectc           ,        s	  d;fdd}ddl m} tjtjtg}jd	k}|r1\}}|t||O }t	j
|6  t jd	krrjtjurrtsa|d
dfddrr|d tjdd|i |}njd	kr҈jtjjju rtjdkr|d tjdkr|}	|}
jd }r|d }|d }t||||\}}ntg|R i |\}}j||}|	|
|| n8t dtj t!jr|d t"jd tj#tj$tj%frjd j&j'}nt( )}n
|d t( )}tjj*j+j,tjj*j-j,tjj*j.j,tjj*j/j,tjj*j0j,g t1dd j2D }j3v }t1 fddj2D }jddrnt"|t4rn|5  jd 6 }tj7j8j9| }|: |krn|snt	;|}t	j<=||}|rt"|t4rt"|j>t	j?r|5  |s|r&t"jd tj@r&|rj3}njd 6 }|d ur&t|dkr&tjAp| o| }tjBCjd }ttD|dk}|s|rt|E dkrjFv r|s|st	jGH|E tjI}|s&t|r&jd J s	t"|j>t	j?rt	j<j=|t	;||d}nd d! |D }t	j<jK|||d}ttj2}|dkrt"|t4rj2D ]}|jtLv r|M  tjj*jNj,tjj*jOj,tjj*jPj,g}g }jQsg|Rtjj*jSj, tjTjUr|tjjVjWj,tjjVjWjXtjj*jYj,tjjZj[j,tjjZj[j\tjjZj[jXtjjZj[j]g7 }|tjjVj^j,tjjVj^jXtjjVj_jXtjjVj`j,tjjZjaj,tjjZjajXg7 }tjTjbr|tjjcjdj,g7 }|j|v rt	j<j=|t	;jd 6 d"d}|j|v r|jed u rt	j<=|t	;tfjd jg}|jd#krt"|j>j>thtifr|5  q;|jtj2 t"|t4r%|k r%|M  t"|t4rGt"|j>tlrG|j>j>}t"|thrG|jmd$d%rG|5  W d    n	1 sRw   Y  W d    n	1 sbw   Y  W d    n	1 srw   Y  t"|t4rt"|j>t	jlrt"|j>j>t	jnr|j>j>od& nTt"|j>j>t	jpr|j>j>od& t"|j>j>t	jqrt"|j>j>j>t	jnr|j>j>j>od& n&t"|j>j>t	jrr|j>j>jsst"|j>j>jtd t	jpr|j>j>jtd od& u| ttvjw  }jd  D ]	}||x O }qjd  D ]	}||x O }qd<fd'd(}jd)krtjyjzj{} d=fd,d-}!|D ]q}"j|}|"g }#| j~|" }$|  |$sxd>d1d2}%|%|$jrd|!|"|$jk|" d3|$j  |%|$jrx|!|"|$jk|" d4|$j  |#D ]*}&tD|&j'}'|'j }(|(rt|(td5})j||)g R|& qz|!|&j'|&j'  qzq5 j|O  _ttjyjzj{jd6i }*|*d usJ td7d |* D }+||+ksJ d8| d3|+ d9  d:|  |S )?Nmsgr   rf   rg   c                   s   t dt j|  d S )Nzlowering %s %s)rT  rU  r   format_node)r  rI  ri   rj   rU    s   z%GraphLowering.run_node.<locals>.debugr   )CompilerBisectorr   ZinductorrD   c                     s   t  S rh   )reprri   rI  ri   rj   <lambda>      z(GraphLowering.run_node.<locals>.<lambda>rB   F)Zadd_to_fallback_setr  Z-user_defined_triton_kernel_layout_constraintsr  arg_kwarg_valsr%   z1Unknown triton_kernel_default_layout_constraint: r}   rD  r   c                 s  s    | ]}|j d kV  qdS )r   Nr   rz   rh  ri   ri   rj   r|         z)GraphLowering.run_node.<locals>.<genexpr>c                 3  s    | ]}|j  v V  qd S rh   )r   r  )as_strided_opsri   rj   r|   	  s    

Zinductor_realize_to_strides   )allow_paddingc                 S  r  ri   r  r  ri   ri   rj   r  H  r  z*GraphLowering.run_node.<locals>.<listcomp>Tr   d   )	thresholdZorigin_nodec                    s@   dd j  d  D } | dd jd  D  d| S )Nc                 S  s"   g | ]}d |   d| dqS )unbacked_symbol_defs= in:

get_unbacked_symbol_defs)rz   r{  ri   ri   rj   r    s    zCGraphLowering.run_node.<locals>.format_new_defs.<locals>.<listcomp>c                 s  s&    | ]}d |   d| dV  qdS )r  r  r  Nr   )rz   rx   ri   ri   rj   r|     s
    
zBGraphLowering.run_node.<locals>.format_new_defs.<locals>.<genexpr>z***
)r   extendr   r   )r  )buffer_watermarkoperation_watermarkr  ri   rj   format_new_defs  s   

z/GraphLowering.run_node.<locals>.format_new_defsr  r  r   c                   s(   t | |} j|dd  | d S )NTr  )r'   ZAssertScalarr  r  )r  r  Z	assert_opr  ri   rj   make_assert  s   z+GraphLowering.run_node.<locals>.make_assertr  r   ry   c                 S  s6   | t t  fv r	dS zt|  W dS  ty   Y dS w )NFT)r$   r  	TypeError)r  ri   ri   rj   is_convertible  s   z.GraphLowering.run_node.<locals>.is_convertiblez >= z <= )r  unbacked_bindingsc                 s  s"    | ]}t jjj||V  qd S rh   )rX   rX  r   Zunbacked_renamingsr   r  ri   ri   rj   r|     s
    
zfailed z (inductor >= fx)
fx node is: z
new operations are:

)r  r   rf   rg   rf   r   )r  r   r  r   rf   rg   )r  r   rf   ry   )Z!torch._inductor.compiler_bisectorr  r#  r   r   r#   rx   Zfetch_args_kwargs_from_envrN   r'   r  Zcurrent_originsr  rX   r   r  r  rC   Zdisable_subsystemrB   ru   r@  r
  r  r&   Z'triton_kernel_default_layout_constraintr   r   r?   r@   r   r  r   r}   ro   r  SymFloatZSymBoolr   r  r   run_noder   Z
as_stridedrA  Zas_strided_Zas_strided_scatterresizeZ	resize_asr   rf  r   r=   r  r"  rW  r  Zany_is_symbolicZmaybe_get_strideZget_stride_orderr  Zrequire_stride_orderr.  r  r   r   Z_prims_commonZis_non_overlapping_and_denser   r0  r   ZFlexibleLayoutZ stride_ordered_for_memory_formatZchannels_lastZ_is_viewZrequire_exact_stridesrG   Zrealize_hintr   mmZ_int_mmr   r  r   r  Z_has_mkldnnrQ  Z_linear_pointwisebinaryZmkldnn_rnn_layerZonednnZqlinear_pointwiser  Zbinary_tensorZ_convolution_pointwiseZ_convolution_pointwise_Z _convolution_transpose_pointwiseZqconv2d_pointwiseZhas_mklZmklZ_mkl_linearrd   r   r  r:   r;   Z
mark_reuseZhas_exceeded_max_readsr<   Zhas_large_inner_fnZLoopsZ_post_init_setattrZBufferr/  ZMultiOutputindicesZinputsr  rp   rq   r!  r   r   r   r   popZvar_to_rangeZ _default_unspecified_value_rangeissubsetlowerupperr   r   r   
setdefaultr   r   r  ),r  rB  rU  r  ZoriginsZis_call_functionrd   re   r  r  r  r  Zinp_argsZ
inp_kwargsZ	is_outputZis_user_visibleZis_input_for_as_stridedr  Zsym_stridesZstride_orderr  ZdenseZunbacked_symbols_in_stridesZ	num_usersrh  Zneed_fixed_layoutZneed_fixed_channels_last_layoutcurrZnew_unbacked_defsr{  rx   r%  r   r&  Zi0ZrasZvrr(  raZfvsmissingi1r)  Zrenamed_unbacked_bindingsr  )r  r#  rB  r$  r  rj   r,    s  































	






    r



	

	zGraphLowering.run_nodec                 C  s,   t jrtdtjdvrtdtj d S )NzC++ codegen is disabled)linuxdarwinwin32zUnsupported platform )r&   Zdisable_cpp_codegenr0   sysplatformr  ri   ri   rj   !validate_can_generate_cpp_wrapper)  s
   
z/GraphLowering.validate_can_generate_cpp_wrapperis_subgraphparent_wrapper_codeOptional[PythonWrapperCodegen]partition_signatures!Optional[GraphPartitionSignature]c                 C  s   | j  }|d |d t|dksJ dd|t|dk}|r)dn| | _| jr5| 	  t
| j| _t| j| j}|d usOJ d| j d|||||| _| jrk| jjj| j_| jjj| j_d S d S )	Nr   r   r%   zDoes not support mixing {}+r   zDevice z not supported)r   r   discardr#  formatr   r1  r   r   r?  r,   r   r-   r  r   r   Z_names_iterZsrc_to_kernel)r  r@  r`  rA  rC  r   Zonly_cpuZwrapper_code_gen_clsri   ri   rj   init_wrapper_code0  s8   




zGraphLowering.init_wrapper_code)tuple[ValueWithLineMap, ValueWithLineMap]c                   s  t fdddD rtjjr S d_ j}dd	d
 tj	j
 }|durOttjtsO|jr8|j  dd |jD } fddt|tjD n fddttjtr]jntjD jrddlm} fddtjD }|D ]}| }t|tjsJ |||< ~qztjj  | W d   n1 sw   Y  d_j  j   j!  tj"j#j$  tj"j#j%  t&'  t(ddi  W  d   S 1 sw   Y  dS  S )zQ
        For GPU, Triton kernels are autotuned and stored as cubin files
        c                 3  s    | ]}| j v V  qd S rh   )r   )rz   r   r  ri   rj   r|   `  r  z9GraphLowering.codegen_with_cpp_wrapper.<locals>.<genexpr>)cudaZxpuFr  1Union[torch.SymInt, torch.SymFloat, torch.Tensor]rf   Union[int, float, torch.Tensor]c                 S  s\   | d u rd S t | tjtjfr| jjS t | trt| S t | tjs,J dt	t
|  | S )Nz&Unknown type when creating real inputs)ro   ru   r  r+  r   hintr   r   r   r   rl  )r  ri   ri   rj   materializej  s   
z;GraphLowering.codegen_with_cpp_wrapper.<locals>.materializeNc                 S  s   g | ]}|d ur|qS rh   ri   )rz   paramri   ri   rj   r    s
    z:GraphLowering.codegen_with_cpp_wrapper.<locals>.<listcomp>c                      g | ]} |qS ri   ri   r  rN  ri   rj   r        c                   rP  ri   ri   r  rQ  ri   rj   r    rR  r%   )clone_preserve_stridesc                   s.   g | ]\}}|j v rt | tjr|qS ri   )r   ro   ru   r   )rz   r   r   )real_inputsr  ri   rj   r    s    
Tztriton.autotune_at_compile_time)r  rK  rf   rL  ))r   r&   tritonautotune_at_compile_timecodegenr   compile_to_modulecallru   Z_guardsZTracingContextZtry_getro   rX   rT  rW   Zoutput_stridesclearparams_flatr  chainr   r   Z
compile_fxrS  r   r   r   r  r  r  r   r   r   r   r   Zprecomputed_replacementsZinv_precomputed_replacementsr(   resetpatch)r  compiledZtracing_contextr[  rS  r   r   Zmutated_inpri   )rN  rT  r  rj   codegen_with_cpp_wrapperZ  s`   






	



$z&GraphLowering.codegen_with_cpp_wrapperc                 C  sJ   ddl m} tdd || j| _ W d   dS 1 sw   Y  dS )z
        (Re)initializes the scheduler member.  When initializing the scheduler, no CUBIN
        files should be generated (to avoid biasing any benchmarks and pessimizing
        fusion decisions).
        r%   )	Schedulerztriton.store_cubinFN)r   ra  r&   r^  r   )r  ra  ri   ri   rj   _update_scheduler  s   "zGraphLowering._update_schedulerc                   s   t ddd` |   |   tj| j| jj | j	
|  | j  tdtjj tj }|rQ|\ tddd  fddd	 tdd
d fddd	 | j	| j}| j	  |W  d    S 1 siw   Y  d S )NGraphLowering.codegenTlog_pt2_compile_eventzFFinished codegen for all nodes. The list of kernel names available: %sartifactc                   S  
   dddS )NZ)inductor_triton_kernel_to_post_grad_nodesjsonr   encodingri   ri   ri   ri   rj   r       z'GraphLowering.codegen.<locals>.<lambda>c                     
   t  S rh   rh  dumpsri   )
debug_infori   rj   r       
 )Zmetadata_fn
payload_fnc                   S  rg  )NZ*inductor_provenance_tracking_node_mappingsrh  ri  ri   ri   ri   ri   rj   r    rk  c                     rl  rh   rm  ri   )node_mappingsri   rj   r    rp  )r   rH  rb  rX   rU  Zdraw_orig_fx_graphr  r   r   r   Zpush_codegened_graphrW  rT  r   r  Z1log_inductor_triton_kernel_to_post_grad_node_infor   generater   Zpop_codegened_graph)r  Zprovenance_infor  ri   )ro  rr  rj   rW    s>   


	
$rc  parent_graphc                 C  s\   t ddd |j| _|j| _|j| _|   | j  W d   dS 1 s'w   Y  dS )a  
        This is a more compact version of the `codegen()` above
        where we codegen this graph as a subgraph of some parent
        graph. The parent graph is passed as an argument: the
        intention is to inline codegening of the subgraph in
        the parent graph's wrapper code (including the generated
        kerenls). The wrapper code is not finalized (via `.generate()`
        call), as this will be done in the parent graph's `codegen()`.
        GraphLowering.codegen_subgraphTrd  N)r   r   r   r   rb  r   rW  )r  rt  ri   ri   rj   codegen_subgraph  s   
"ru  Vtuple[int, list[tuple[BaseSchedulerNode, int]], list[tuple[BaseSchedulerNode, float]]]c                 C  sX   d}g }g }| j jD ]}| }||7 }|||d f ||| f q
|||fS )Nr   r  )r   r   Zget_read_write_buffers_sizesr  Zget_estimated_runtime)r  total_bytesZnode_countsZnode_runtimesr   	num_bytesri   ri   rj   count_bytes	  s   
zGraphLowering.count_byteszOptional[Callable[[str], None]]save_output_coder\   c                 C  s<   t ddddd |  W  d    S 1 sw   Y  d S )NGraphLowering.compile_to_moduleZcode_genTZ,inductor_code_gen_cumulative_compile_time_us)Z
phase_namere  Zdynamo_compile_column_us)r   _compile_to_moduler  ri   ri   rj   rX    s   $r|  c                   s   ddl m} | jr|  n|  \}tjjr-d| jj	
  | jj
  d }|j _tjd ur8tj tdj t }tj|jd zdd jD }|j\} td	  W n tyu   td
fddd  w td
 fddfddd tddd |j| |i | j| jd}W d    n1 sw   Y  || _ | _|| _tj rtj!r|j"ddd |j#d usJ t$|j# t%d	|j# t&d	|j# tj'rt(d|j# t)j*d t+j,|j# t+j-t.j/0|j#d d  |S )Nr%   )PyCodeCachez%"""
Compile-time auto-tuning block: 
z"""
zOutput code: 
%s)codec                 S  s   g | ]	\}}||j fqS ri   )Zstack_trace)rz   line_nor   ri   ri   rj   r  >  s    z4GraphLowering._compile_to_module.<locals>.<listcomp>zOutput code written to: %sZinductor_output_codec                         j S rh   r  ri   r  ri   rj   r  H      z2GraphLowering._compile_to_module.<locals>.<lambda>)rq  c                     s   d iS )Nfilenameri   ri   )pathri   rj   r  N  r  c                     r  rh   r  ri   r  ri   rj   r  O  r  zPyCodeCache.load_by_key_pathTrd  )linemapattrs)timesrepeatzCompiled module path: )filer   z.debug)1Z	codecacher~  r   r`  rW  r&   rU  rV  r   Zkernel_autotune_defsgetvalueZkernel_autotune_callsr  r   r{  rb   rU  rJ   Zinductor_meta_from_configrK   Zbegin_compileZline_mapwriter  r   r   Zload_by_key_pathr   r   r  r  r  Zbenchmark_harnessZprofile_bandwidth_outputZbenchmark_compiled_module__file__rc   rT  rj  Zbenchmark_kernelprintr=  stderrrX   Zoutput_coder   osr  splitext)r  r~  r%  Ztuning_codeZinductor_metar  r  modri   )r  r   rj   r}  %  st   






z GraphLowering._compile_to_modulec                 C  s   g }t d}t d}| jD ]1}t|tjr%|| j dt|  qt|tj	r9|| j dt|  q||
  q|S )Nr   Z_none_shape)r  r  r   ro   r'   ZNoneAsConstantBufferr  r   r   r  r  )r  namesZshape_counterZnone_counterr   ri   ri   rj   get_output_namesl  s   


zGraphLowering.get_output_namesc                 C  sR   || j  v o#| j |  dko#t| j |  dko#t| j | dkp(|| jv S )Nr%   r   r   )r   r   r  r#  r0  r7   r   r]  ri   ri   rj   is_unspec_argy  s   zGraphLowering.is_unspec_arg)NNNFFNNFFFNNNNNN)$r   r   r   r   r   r   r   r   r   ry   r   ry   r   r   r   r   r   ry   r   ry   r   ry   r   r   r   r   r   r   r   r   r   r   r   r   rf   rg   )rf   rg   )r  r  rf   r  )r  r  rf   r'  )r   r,  rf   r-  )r   r2  rf   r   )r   r6  r7  r)   rf   ry   )rf   r9  )r   r9  rf   r<  r*  )r   r^   r   ry   rf   ry   )r   r   rf   r   )r   r   r   r_  r`  r   rf   ra  )rf   rd  )r   r   rf   rg   )r   r9  rf   rg   )rf   ro  )rp  r   rf   rq  )rw  rm   rf   rg   )rp  r   rf   ry  )rp  r   rf   r}  )rp  r   rf   r  )rd   r   rf   r   )rx   r  rf   r   )r  r  r  ry   rf   r   )r  r  rf   r   )r  r  rf   rg   )r   r   rf   r  )r   r   r.  r  rf   r   rh   )r.  r   r   r   rf   r=   )r   r   r  r  rf   r   )r   r   rd   r  re   r  rf   r  )r   r   rd   r   re   r  rf   r   )r  r  rf   ry   )r   r   rd   r  re   r  rf   r  )r   r   rd   r   re   r   rf   r   )r   r   rd   r  re   r  rf   rg   )r   r   )rf   r<  )r  r   r  r  r  r  r  r  r  r  rf   rg   )rB  r   rf   r  FNNN
r@  ry   r`  r   rA  rB  rC  rD  rf   rg   )rf   rI  )rt  r   rf   rg   )rf   rw  )rf   r\   )rf   r  )r   r   rf   ry   )A__name__
__module____qualname____annotations__r   r  r&  r+  r1  r5  r8  r;  
contextlibr   r=  r?  staticmethodr   r^  rc  r   rk  rn  propertyrX  rv  rx  r|  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r   r  r  r  r  r,  r?  rH  r`  rb  rW  rv  rz  r{  rX  r}  r  r  __classcell__ri   ri   r  rj   r   
  s   
  
.

%



	

 
$


2






	




 
bX1
]J  
	*
e-	Gr   c                      s:   e Zd ZdZd fd	d
Z				dd fddZ  ZS )ra  z
    Mostly a helper class for the subgraph lowering. The main goal is to call
    init_wrapper_code with the subgraph related arguments.
    rb  r   rd   r   re   rf   rg   c                   s   || _ t j|i | d S rh   )rb  r   r   )r  rb  rd   re   r  ri   rj   r     s   zSubgraphLowering.__init__FNr@  ry   r`  r   rA  rB  rC  rD  c                   s   t  jd| j| jjd d S )NT)r@  r`  rA  )r   rH  r   rb  r   )r  r@  r`  rA  rC  r  ri   rj   rH    s
   
z"SubgraphLowering.init_wrapper_code)rb  r   rd   r   re   r   rf   rg   r  r  )r  r  r  __doc__r   rH  r  ri   ri   r  rj   ra    s    ra  )rd   r   re   r   rf   rg   )rl   rm   rf   rn   )rx   r   rf   ry   )r~   r^   r   r   rf   r   )r   r_   rf   r   )r   r_   r   r   rf   rg   )
__future__r   r  r	  r  rh  loggingr  r  r  r=  r   collectionsr   r   typingr   r   r   r   r	   r
   rp   r   ru   Ztorch._loggingZtorch.fxr   r   Ztorch._decompr   Ztorch._dynamo.utilsr   r   Z"torch._library.fake_class_registryr   r   r   Ztorch._prims_commonr   r   Ztorch._subclasses.fake_tensorr   Z%torch.fx.experimental._backward_stater   Ztorch.fx.experimental.sym_noder   r   Z%torch.fx.experimental.symbolic_shapesr   r   r   r   r   r   r    Ztorch.fx.noder!   Ztorch.utils._mode_utilsr"   Ztorch.utils._ordered_setr#   Ztorch.utils._sympy.numbersr$   r   r&   r'   r(   Zcodegen.commonr)   r*   r+   r,   r-   r.   r/   excr0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r  r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   ZruntimerJ   Zruntime.autotune_cacherK   r   rL   r  rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   ZvirtualizedrW   rX   collections.abcrY   rZ   r[   typesr\   Ztorch._higher_order_ops.effectsr]   r^   Ztorch.fx.graphr_   Zcodegen.wrapperr`   r   ra   Ztorch._inductor.codecacherb   	getLoggerr  rT  Z_loggingZgetArtifactLoggerri  r@  r   r  r   r   Ztorch._inductor.fb.utilsrc   rw   r}   r   r   r   r  ZInterpreterr   ra  ri   ri   ri   rj   <module>   s     $	$	440







M               	