o
    ZhZ                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZ d d	lmZmZm Z m!Z!m"Z"m#Z# d d
l$m%Z% d dl&Z'd dl(Z'd dl)m*  m+Z, d dl-m.Z. d dl'm/Z/ d dl0m1Z1 d dl2m3Z3m4Z5mZ6m*Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZE d dlFm4ZG d dlHmIZI d dlJmKZKmLZLmMZM d dlNmOZOmPZPmQZQ d dlRmSZSmTZTmUZUmVZV d dlWmXZX d dlYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_ d dl`maZa d dlbmcZcmdZdmeZemfZfmgZgmhZhmiZimjZj d dlkmlZl d dlmmnZn d dl(moZo d dlpmqZqmrZr d dlsmtZt d d lumvZv d d!lwmxZx d"d#lymzZz d"d$l{m|Z|m}Z} d"d%l~mZ d"d&lmZ d"d'lmZ d(d)lm4Z4mZ d(d*lmZ d(d+lmZ d(d,lmZ d(d-lmZ d(d.lmZmZ d(d/lmZ d(d0lmZ d(d1lmZmZ d(d2lmZ d(d3lmZ d(d4l*mZmZmZmZmZmZmZmZmZ d(d5lmZ erd d6lmZmZ d d7lYmZ d d8lmZ d(d9lmZ e d:Zed;Zese4 sdd@dAZddFdGZnd dHlmZmZ er"d dIlmZmZmZ G dJdK dKejZddLdMZe ZeeZe'jedNZe'jedOZe'jedPZe'jedQZĐd dUdVZŐddYdZZeddd\d]Zeddd^d_ZɐddcddZʐddfdgZːddjdkZ̐ddldmZ͐dddqdrZ	s		d	d
dzd{Zϐdd|d}ZАdddZѐdddZ	nddddZ	ddddZejՐdddZG dd de"dndZG dd de!ZؐdddZedddddZG dd dZG dd deZG dd de܃ZݐdddZސdddZ	ddddddddZdddZdddńZ	ddddɄZedfdddфZed Zddd݄ZdddZdddZeddfdddZdddZd ddZd ddZd!ddZ	dddd"ddZdS (#      )annotationsN)ABCabstractmethod)defaultdict)AbstractContextManager)currentframe)count)AnyCallableOptionalTYPE_CHECKINGTypeVarUnion)Neveroverride	ParamSpecProtocol	TypedDictUnpack)mock)#min_cut_rematerialization_partition)fx)enable_python_dispatcher)compiled_autogradconfigloggingutils)get_interface_for_device)wrap_compiler_debug)	chromium_event_timedCompileEventLoggercountersdetect_fake_modedynamo_timedflatten_graph_inputsget_metrics_contextlazy_format_graph_codeset_feature_use)r   )!unwrap_tensor_subclass_parameters)aot_export_modulemake_boxed_funcSerializableAOTDispatchCompiler)	code_hashFxGraphCacheoutput_code_log)BoxedDeviceIndexformat_default_skip_message#log_cudagraph_skip_and_bump_counterPlaceholderInfo)save_args_for_compile_fx_inner)CompiledAOTICompiledFxGraphCompiledFxGraphConstantsWithGmget_expanded_dimsindex_expanded_dims
OutputCode)	cache_dir)	BoxedBoolcount_tangentsfresh_inductor_cache	InputTypeis_gpushould_assume_input_aligned should_use_remote_fx_graph_cachetensor_is_aligned)trace_structured)compile_time_strobelight_meta)GraphModule)free_unbacked_symbolsSymExprPrinter)FakeTensorProp)_WaitCounter)
OrderedSet   )aot_autograd)ShortenTraceback	SkipFrame)_use_lazy_graph_module)_PyTreeCodeGen)
has_triton   )r   metrics)DebugContext)select_decomp_table)InductorError)joint_graph_passes)post_grad_passesview_to_reshape)pre_grad_passes)GraphLowering)get_device_typeIRNode)complex_memory_overlap)TritonBundler)	align_inputs_from_check_idxsclone_preserve_stridescopy_misaligned_inputs get_cloned_parameter_buffer_name%get_first_incompatible_cudagraph_node#maybe_get_suppress_shape_guards_ctxoutput_noderemove_unaligned_input_idxsshape_env_from_inputs)V)	GeneratorSequence)_StrideExprStr)
OpOverload)ExternKernelNode_P_Tattrstrreturn.Callable[[Callable[_P, _T]], Callable[_P, _T]]c                 C  s   t jS N)dynamo_utilsidentityrq    ry   I/var/www/auris/lib/python3.10/site-packages/torch/_inductor/compile_fx.pytime_and_log   s   r{   argsobjectkwargsNonec                  O     d S ru   ry   r|   r~   ry   ry   rz   log_optimus_to_scuba   s   r   )r   r{   )FQNGraphInputNameGraphSignaturec                   @  s   e Zd ZdZdZdZdS )FxCompileModer   rR   rK   N)__name__
__module____qualname__NORMAL	SERIALIZE
SUBPROCESSry   ry   ry   rz   r      s    r   c                  C  s   d} t j| }|d u rtjS z	| }t| W S  tyH   dd l}|t	}|
d|| dtdd tj D  t j|  tj Y S w )NZTORCHINDUCTOR_FX_COMPILE_MODEr   z>Invalid value of %s for %s. Expected one of %s. Using default., c                 s  s    | ]}t |V  qd S ru   )repr.0xry   ry   rz   	<genexpr>       z+_fx_compile_mode_default.<locals>.<genexpr>)osenvirongetr   r   upperKeyErrorr   	getLoggerr   errorjoinsorted__members__keyspop)namevaluer   logry   ry   rz   _fx_compile_mode_default   s&   


r   Z
perf_hintsZpre_grad_graphsZpost_grad_graphsZcudagraph_static_inputs	num_fixedint	list[int]c                 C  s2   t jj }tt| }|r|js|S ||jj S ru   )torch_guardsTracingContexttry_getlistrangefw_metadatastatic_input_indices)r   contextfixedry   ry   rz   get_static_input_idxs   s
   
r   gmrE   c                 C  s|   | j jddd }g }|jd D ]&}t|tjjr1|jd }d ur1t|tj	r1|
|  q|
d  q||jd< d S )Noutputopr   valZoriginal_output_strides)graph
find_nodesr|   
isinstancer   r   Nodemetar   Tensorappendstride)r   rf   output_stridesr   r   ry   ry   rz   record_original_output_strides   s   
r   Callable[..., None]c                   C  s
   t tS ru   )dynamo_loggingZget_step_loggerr   ry   ry   ry   rz   _step_logger   s   
r   c                   C  s>   t j rt jjjjst j dkrtd d S d S d S d S )N)   r   zTensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.)	r   cudais_availablebackendsmatmulZ
allow_tf32Zget_device_capabilitywarningswarnry   ry   ry   rz   _warn_tf32_disabled   s   
r   modgraph_signaturer   c                 C  s  ddl m}m} i }| jddD ]\}}|||< |||||jd q| jddD ]\}}|||< |||||jd q(|jjdd}	g }
|	D ]<}|j	}||j
v rZ|j
| }|
| qE||jv ru|j| }|
| t|| |jt|< qE||jv s|J |
d  qEddlm} t|jjd	 jd }g }|j}|j}|j}t|D ].\}}d }|t|t| t| k r|j	|v r||j	 }n
|j	|v r||j	 }|| q|||
|t d |i }|S )
Nr   )_assign_attr	_AttrKindF)Zremove_duplicate)Z	attr_kindplaceholderr   )_unlift)Ztorch.export.unflattenr   r   Znamed_parametersZ	PARAMETERZnamed_buffersZBUFFERr   r   r   Zinputs_to_parametersr   Zinputs_to_buffersra   r   rc   Zuser_inputsZtorch.export._unliftr   r   nodesr|   Zbuffers_to_mutateZuser_inputs_to_mutateoutput_tokens	enumeratelenpytreeZLeafSpec)r   r   r   r   r   Z
state_dictr   parambufferplaceholder_nodesZlifted_inputsnodeZ	node_nameZparameter_nameZbuffer_namer   ZoutputsZmutated_outputsZbuffer_mutationsZuser_input_mutationsr   idxoutr   unlifted_gmry   ry   rz   _unlift_graph   sp   








	r   Generator[str, None, None]c                 c  s    t t| jjdtjjjd| jjdtjjj	dD ]7}|j
tjjjkr8|jd j}|jd j}|V  |V  q|j
tjjj	krR|jd j}|jd j}|V  |V  qd S )Ncall_functionr   targetrR   rK   r   )r   	itertoolschainr   r   r   opsZhigher_orderZcondZ
while_loopr   r|   r   )r   r   Ztrue_subgraph_nameZfalse_subgraph_nameZcond_subgraph_nameZbody_subgraph_namery   ry   rz   _get_subgraph_names>  s*   
r   example_inputsSequence[InputType]c                 C  sz   t dddd, tj}tj}t| D ]}t| |}t|d}t| || qt| |||W  d    S 1 s6w   Y  d S )N_recursive_pre_grad_passesTZpre_grad_pass_time_uslog_pt2_compile_eventdynamo_compile_column_usry   )	r#   r   Zadd_pre_grad_passesZremove_pre_grad_passesr   getattrr   setattrrZ   )r   r   Z
add_passesZremove_passessubgraph_namesubgraphZnew_subgraphry   ry   rz   r   S  s   

$r   c                 C  s\   t dddd t| D ]}t| |}t| qt|  W d    d S 1 s'w   Y  d S )N_recursive_joint_graph_passesTZjoint_graph_pass_time_usr   )r#   r   r   r   rW   )r   r   r   ry   ry   rz   r   f  s   


"r   Fis_inferenceboolc                 C  s`   t dddd t| D ]}t| |}t|| qt| | W d    d S 1 s)w   Y  d S )N_recursive_post_grad_passesTZpost_grad_pass_time_usr   )r#   r   r   r   rX   )r   r   r   r   ry   ry   rz   r   r  s   
"r   Tskip_constructorlifted_constant_namesOptional[list[str]]skip_folding_node_fn)Optional[Callable[[torch.fx.Node], bool]]"tuple[GraphModule, dict[str, int]]c                 C  sZ  ddl m}m}m}m}m} || |||}	|du r|	 nd}
dd tt|	jj	d j
d D }g }g }i }| jj	D ]}|j|v rG|| q:|j| |krX|jdkrX|| q:|D ] }d|j }|| ||du rp|
||j  nd| ||j ||< q[|ddd D ]!}|jr|jD ]}|j| |ksJ d	| d
qq| j| q|   |	|fS )a  
    This function takes an GraphModule input "gm".
    The gm will be split into 2 components,
      1) const_gm, which consists the subgraph of gm that can be constant folded.
      2) gm (being inplace modified,) which returns the graph after constant folding.

    If an additional "lifted_constants" argument is passed in, we will assume the gm has
    been lifted and run the transformation accordingly.

    When a "skip_folding_node_fn" callback is passed, we will skip constant folding on
    the nodes for which the callback returns True.

    const_output_index is a mapping of corresponding node name from gm to the
    output index of const_gm.
    Returns (const_gm, const_output_index)
    r   )CONST_MODULE_TAGMETA_TAG
MODULE_TAGreplace_node_with_constantrun_and_get_constant_graphNc                 S  s   i | ]\}}|j |qS ry   )r   r   r   r   ry   ry   rz   
<dictcomp>  s    z"split_const_gm.<locals>.<dictcomp>r   r   Z_FOLDED_CONST_znode: z user not empty.)Z torch._inductor.constant_foldingr   r   r   r   r   r   tupler   r   r|   r   r   r   r   Zusers
erase_node	recompile)r   r   r   r   r   r   r   r   r   const_gmZconst_resultZconst_outputsZto_erase_nodeZto_replace_nodeconst_output_indexr   Znew_const_namenry   ry   rz   split_const_gm~  sH   




 r	  c                 C  s   t jj}t|jj|jj|jj|jjg}|D ].}| j	j
d|dD ]#}t|jdd t jrC|jd jt jkrC|jd jjdkrC  dS q qdS )Nr   r   r   r   TF)r   r   atenrJ   mmdefaultZaddmmZbmmZbaddbmmr   r   r   r   r   r   dtypeZfloat32devicetype)r   r
  Ztf32_opsr   r   ry   ry   rz   is_tf32_warning_applicable  s$   r  "AbstractContextManager[None, None]c                 C  s^   t dd | D }tjrtjr|std tjddS tjjr+td tjddS t	
 S )z
    For CPU backend, enable comprehensive padding causes some unit tests
    fail due to changing number of generated kernels. Skip for now.
    c                 s  s(    | ]}t |tjrt|jjV  qd S ru   )r   r   r   r?   r  r  )r   try   ry   rz   r     s    

z6maybe_disable_comprehensive_padding.<locals>.<genexpr>z!Skip comprehensive padding on CPUF)comprehensive_paddingz;Skip comprehensive padding for use_runtime_constant_folding)anyr   Zdisable_padding_cpur  perf_hint_loginfopatchaot_inductoruse_runtime_constant_folding
contextlibnullcontext)r   Zhas_gpury   ry   rz   #maybe_disable_comprehensive_padding  s   
r  cpp_wrapperaot_modec                 C  s   | s|r
t jddS t S )zH
    graph partition does not support cpp_wrapper and aot_mode yet.
    F)Zgraph_partition)r   r  r  r  )r  r  ry   ry   rz   maybe_disable_graph_partition  s   r  force_allow_non_fake_inputs torch._subclasses.FakeTensorModec              	   C  s   t  Y t|}|stjjdd}t| |dj|  n1|s!t nt	j
|dd}| t| |dj|  W d   n1 s?w   Y  W d   |S W d   |S W d   |S 1 s_w   Y  |S )z}
    If we can not detect fake mode from the context of inputs, create one.

    The created fake mode will be returned.
    Tallow_non_fake_inputs)moder#  N)r   r"   r   _subclassesFakeTensorModerH   	propagater  r  r   r  r}   Zpropagate_dont_convert_inputs)r   r   r   	fake_modectxry   ry   rz   fake_tensor_prop  s0   



r*  config_patches$Optional[Union[str, dict[str, Any]]]dict[str, Any]c                 C  s6   t |  t  W  d    S 1 sw   Y  d S ru   )r   r  Zget_config_copy)r+  ry   ry   rz   get_patched_config_dict  s   $r.  Generator[None, None, None]c                   c  sL    t jr!tt dd d V  W d    d S 1 sw   Y  d S d V  d S )NF)dirdelete)r   force_disable_cachesr=   r:   ry   ry   ry   rz   with_fresh_cache_if_config$  s   "
r3  c                   @  s^   e Zd ZU ded< ded< ded< ded< ded	< ded
< ded< ded< ded< ded< dS )_CompileFxKwargszOptional[BoxedBool]
cudagraphsSequence[int]static_input_idxsr   is_backwardzOptional[int]graph_idr  r  r   zOptional[bool]
layout_optz1Optional[Callable[[list[ExternKernelNode]], Any]]extern_node_serializerzOptional[BoxedDeviceIndex]boxed_forward_device_indexN)r   r   r   __annotations__ry   ry   ry   rz   r4  0  s   
 r4  )totalc                   @  s   e Zd Zdd	d
ZdS )_CompileFxCallabler   rE   r   r   r~   Unpack[_CompileFxKwargs]rs   r9   c                 K  r   ru   ry   )selfr   r   r~   ry   ry   rz   __call__>  s   z_CompileFxCallable.__call__Nr   rE   r   r   r~   r@  rs   r9   )r   r   r   rB  ry   ry   ry   rz   r?  =  s    r?  r@  r9   c              	   K  sD  | dd  | dd | dd | dd  | dd | dd | d	d  | d
d  | dd  t ^}|tjj  |tt	j
 |tjddddd |td  tjjjro|tjj  |t  |t  tjd|d d ttdd| |fi |W  d    S 1 sw   Y  d S )Nr5  r7  ry   r8  Fr9  r  r   r<  r:  r;  compile_fx_innerinductor_compileTZ#inductor_cumulative_compile_time_us)
phase_namer   r   z#pytorch.wait_counter.dynamo_compile)r8  inductor)Zcompiler_name)
setdefaultr  	ExitStackenter_contextr   r   Z_python_dispatchZ_disable_current_modesrO   dynamo_configuse_lazy_graph_modulerv   r#   rI   guard_dynamoZcallback_handlerZprevent_duplicate_callbacksZinstall_callbacksr3  rT   r    pt2_compiler   _compile_fx_inner)r   r   r~   stackry   ry   rz   rD  F  sH   


$rD  zcompilation time (in seconds)rx   graph_kwargsc                   s  t j}t| jdkr|sddlm} ||  t| j	S |
dd}td| t||}tttt| jjjd ttfsHJ d| j |d }du r[ttjj |d< }tjrgt| |fi | t }t }	td	  }
td
   tj! otj"p|	o| }tj"}|	}t#d| t$|D ]\}}t|t%j&rt'|j(j)r||v rd|_*qd}d}d d}t+| }t, }|rt-.| ||||\} |dur|\}}|rt-/ }t-j0||||||dd|d\}  du s d dkr	|du sJ t1| ||fi |}n d dkr|du sJ |dusJ t23  zUz,t1| ||fi |}|dus5J t, | |_4|d }||_5t26 \}}|7| W n" t8t9fyZ     t:yq } zt;|t< =|j>dd}~ww W t2?  nt2?  w |durt@| d< |j4 d< t-A||||| n d dksJ |dusJ |dusJ |d }||_5|dusJ |} durǈ d ndtBjCd  pi |d tBjDd| r dnd r dnd r dnd||d  durtEdfd d! fd"d!d# |F||| W d   n	1 s"w   Y  W d   n	1 s2w   Y  tGd$t |  tGHd%d&Id'd( tJd) K D  t%jLjMjNO  tP tQjRd*|d rfd+nd, d-|d.   |S )/z
    Inductor API that compiles a single graph.

    If you change the argument list for this function, make sure you
    also update the call to save_args_for_compile_fx_inner below accordingly.
    r   )_LazyGraphModuler7  ry   z&static input idxs compile_fx_inner: %szGinductor can only compile FX graphs which return a tuple/list, but got r5  Nz+pytorch.wait_counter.fx_codegen_and_compilez*pytorch.wait_counter.all_compilation_typesZfx_cacheTr8  F)r8  	constantscache_statebypassZmisstriton_bundler_metaZtime_taken_nshitdisabledfx_graph_cache_)metadatatime_nsrE  key
componentscache_bypass_reasonzcache not enabled)rU  Zcache_event_timer]  r^  r_  Zremote_cache_enabledZlocal_cache_enabledartifactc                     s   d  ddS )NrZ  jsonr   encodingry   ry   )rU  ry   rz   <lambda>:  s   z#_compile_fx_inner.<locals>.<lambda>c                     
   t  S ru   ra  dumpsry   )
cache_infory   rz   rd  >     
 Zmetadata_fn
payload_fnz%FX codegen and compilation took %.3fsz&Overview info of inductor aten mms: %sr   c                 s  s&    | ]\}}d | d| dV  qdS )(: )Nry   )r   r]  r   ry   ry   rz   r   G  s    
z$_compile_fx_inner.<locals>.<genexpr>Zaten_mm_infoztorchinductor done compiling 	BACKWARDSFORWARDS graph r9  )Sri   aot_compilationrv   Zcount_callsr   Ztorch.fx._lazy_graph_modulerS  Zforce_recompiler*   forwardrH  static_inputs_logdebugget_input_idxs_to_checkr   nextiterreversedr   r|   r  r   r   r;   r   tritonr5  Z	save_argsr3   timerA   rI   rM  r2  Zfx_graph_cacher'   r   r   r   r?   r  r  Z_is_inductor_staticr6   r\  r-   Zprepare_keyZget_remote_cacheZload_with_keyfx_codegen_and_compiler_   Zbegin_compileZ_time_taken_nsZ_fx_graph_cache_keyZcollectZset_triton_bundlerM   rN   	ExceptionrV   r   with_traceback__traceback__Zend_compilerr   Z_save_graphr    ZinstantrO  rC   Zpost_compiler   r  r   r!   items	_inductorZasync_compileZCompiledTritonKernelscache_clearr   r   INFO)r   r   rR  r  rS  r7  inputs_to_checkr5  startZfx_graph_remote_cache_Z	use_cachelocalremoteiinputZmb_compiled_graphZkey_infoZremote_cacherT  
start_timer]  Zdebug_lines	cache_keyZtriton_bundlerW  eZcompiled_graphry   )rh  rU  rz   rP  {  sR  


&













   

	rP  c                   @  s$   e Zd ZU dZded< d	ddZdS )
_FxCompileStatr   r   codegen_and_compilers   rr   c                 C  s   d| j  S )Nzcodegen_and_compile: )r  )rA  ry   ry   rz   __repr__]  s   z_FxCompileStat.__repr__Nrs   rr   )r   r   r   r  r=  r  ry   ry   ry   rz   r  Y  s   
 r  c                   @  s>   e Zd ZU dZeeZded< edddZ	e
dddZdS )	FxCompileza
    An FxCompile represents a mechanism that can turn a GraphModule into an
    OutputCode.
    z%dict[type[FxCompile], _FxCompileStat]_compile_statsr   rE   r   r   r  r6  rR  r4  rs   r9   c                 C  r   ru   ry   )rA  r   r   r  rR  ry   ry   rz   r  m  s   zFxCompile.codegen_and_compiler   c                 C  s   | j   d S ru   )r  clear)clsry   ry   rz   _reset_statsv  s   zFxCompile._reset_statsN
r   rE   r   r   r  r6  rR  r4  rs   r9   rs   r   )r   r   r   __doc__r   r  r  r=  r   r  classmethodr  ry   ry   ry   rz   r  a  s   
 r  c                   @  s   e Zd ZedddZdS )_InProcessFxCompiler   rE   r   r   r  r6  rR  r4  rs   r9   c           -        s	  d|v r
|d d usJ |d }| dd}| dd}| dd }| dd}	tj}
| dd}| d	d }| d
d }td 4 t  tj }d uradd l	}t
d| || trht  td  }ttt d t tjd|rdnd d|  dC fddtddd fddd tj  t }t t  t }W d    n1 sw   Y  t  t!| t"}| t#|d W d    n1 sw   Y  tj$  t%dt&dd d d d! td"fd#dd$ tj'j(r)tj)j*+j,tdd%d fd&dd tj-j_.t/ }|0 rMtj1d'k r@t2td( 3 }ntd( 4 }t5j6d |d) t7 rnzt8d*t9t: id+ W n t;ym   t
d, Y nw W d    n	1 syw   Y  t!| t<  t=|	|
 d }d }d }d }|
rtj>j?rt@\}}tA|g |||	|
|||d d-
}tB| |	sJ d.|C  |D \}}W d    n	1 sw   Y  tA |||	|
|||||r|jEnd |r|jEnd ||d/}tFG }tB| |jC   g }|jHd urKtI |jHD ]1}tJ|tKrD|L rDtMtN|O dkrD|PtQfd0d1|R jSD  q|Pd  qtT| tUd2d d3z |V  |jWrd4d5lXmY} |jZslJ d.|D \} }!t[d6| jE |!jErt[d7|!jE d }"|j\r|]|j\}"t[d8|" tUd9d d3 |j^|| jE|!jE|"|j_g t`a|jbjcd:}#W d    n	1 sw   Y  n|d je}#W d    n	1 sw   Y  |f \}$}%}&tF jg|$7  _gtF jh|&7  _htF ji|%7  _i|rTtjjjkrTtj,jlsTtj-jmjn  rTd }'j,joD ]-}(|(jp d;d })|(jqd<ks-tJ|)tjrr-tj-jmn|)s/q|(jp d=d  }'r; nqd>}*|'rK|* d?|' d@}*n|* d@}*|*tj,_l|rtj,jlsts}+|+rdA|+jt }*|+jp d=d  }'r{|* d?|' d@}*|*tj,_ltjrtJ|#t9tufsJ tv|#W  d    W  d    W  d    W  d    W  d    W  d    S |rtj,jlsddBlwmx}, |,tj,jytj,_l| jzt{|   j|d47  _|t}|#||tj,jl|~ td | | ||||W  d    W  d    W  d    W  d    W  d    W  d    S 1 s#w   Y  W d    n	1 s3w   Y  W d    n	1 sCw   Y  W d    n	1 sSw   Y  W d    n1 scw   Y  W d    d S W d    d S 1 s|w   Y  d S )DNr5  r7  ry   r8  Fr9  r  r   r;  r<  z/pytorch.wait_counter.actual_codegen_and_compiler   z3Sleeping for %s since sleep_sec_TESTING_ONLY is setrG  i  ztorchinductor compiling ro  rp  rq  rs   rr   c                    s*   t  } tjjjj|  dd d |  S )NrG  )save_dir)ioStringIOr   rN  ZreproZ	after_aotZsave_graph_reprogetvalue)fd)r   r   ry   rz   log_graph_runnable  s
   

zC_InProcessFxCompile.codegen_and_compile.<locals>.log_graph_runnabler`  c                   S  
   dddS )NZfx_graph_runnablestringrb  ry   ry   ry   ry   rz   rd       z9_InProcessFxCompile.codegen_and_compile.<locals>.<lambda>c                     s     S ru   ry   ry   )r  ry   rz   rd    s    rj  r   %szAFTER POST GRADTinclude_strideinclude_deviceZcoloredZinductor_post_grad_graphc                     s    j ddddS )NFTZprint_outputr  r  )print_readablery   )r   ry   rz   rd    s    rk  c                   S  r  )NZinductor_post_to_pre_grad_nodesra  rb  ry   ry   ry   ry   rz   rd    r  c                     re  ru   rf  ry   )provenance_tracking_jsonry   rz   rd  
  ri  )   
   Zgraph_break)	overwritenum_graph_breaksZpt2_configs)Zextra_loggingzfailed to log pt2_configs)	r   	shape_envr9  r  r  r;  r   r8  Zis_const_graphz"AOT mode only supports C++ wrapper)r   r  r9  r  r  r;  r   r8  r  const_wrapper_codeconst_kernel_codeZconst_moduler  c                 3  s    | ]}  |V  qd S ru   )Zdoprint)r   s)pry   rz   r   m  s    z:_InProcessFxCompile.codegen_and_compile.<locals>.<genexpr>zGraphLowering.compile_to_fn)r   rR   )AotCodeCompilerzOutput wrapper code: 
%szOutput kernel code:
%sz#Serialized Extern Kernel Nodes: 
%szAotCodeCompiler.compile)device_typeadditional_filesr   r   stack_tracezWgraph with symbolic shapes inputs and config.triton.cudagraph_skip_dynamic_graphs=True.z Found from 
z,disabling cudagraphs due to incompatible op ) check_lowering_disable_cudagraphr  )r   ri   rr  rI   rM  rv   preserve_rng_stater   Zsleep_sec_TESTING_ONLYr{  r   warningsleepr  r   r!   copysyssetrecursionlimitmaxgetrecursionlimitr   r   r  rC   ru  Zfx_graphrh   rY   r   Zno_gradr*  r   set_fake_modeget_cuda_device_contextr   Zfx_graph_transformedpost_grad_graphs_logr&   traceenabledr   	tracebackZget_graph_provenance_jsonr   r  Z _inductor_post_to_pre_grad_nodesr%   Zin_progressversion_infosumvaluesr>  r    Zcompilation_metric	is_fbcoder   rr   r.  
ValueErrorr  r  r  r  r	  r[   Zset_graph_handlerrunZcodegen_with_cpp_wrapperr   rS   ZCachedMetricsHelpergraph_outputsrG   r   r]   Zhas_tensor_outputr   rF   Z
get_strider   r  Z
get_layoutr   _check_triton_bf16_supportr#   Zfreeze_runtime_assertsr  Z	codecacher  r  r.   Zextern_kernel_nodesr;  compiler  dictfromkeyswrapper_coder  Zcompile_to_modulecallZcount_bytesZnum_bytes_accessednode_runtimesnodes_num_elemrz  Zcudagraph_skip_dynamic_graphsZdisable_cudagraphs_reasonr   Zany_is_symbolicr   r   r   r   rd   r   r   r4   torch._inductor.cudagraph_utilsr  Zdevice_node_mappingr  r  r  r5   Z
get_deltas)-rA  r   r   r  rR  r5  r7  r8  r9  r  r  r   r;  r<  Z	sleep_secr{  Zinductor_countersr  r(  cuda_contextZmetrics_contextr  r  Zconst_graphr  r  r  r   Zmetrics_helperr   r   r  r  Zkernel_codeZserialized_extern_kernel_nodescompiled_fn	num_bytesr  r  r  r   Zmeta_valdisableZmaybe_incompat_noder  ry   )r   r   r  r  r  rz   r  |  sN  




	




	



=





5


*   F
 *  G   Tz'_InProcessFxCompile.codegen_and_compileNr  )r   r   r   r   r  ry   ry   ry   rz   r  {  s    r  r  r6  c                 K  s\   t tjkr	t }nt tjkrddlm} | }nt tjkr&ddlm	} | }|
| |||S )NrR   )_DebugSerdeFxCompile)_SubprocessFxCompile)fx_compile_moder   r   r  r   Zcompile_fx_extr  r   Zcompile_fx_subprocr  r  )r   r   r  rR  schemer  r  ry   ry   rz   r|    s   



r|  inputsr7  c              	   C  s   g }t | D ]F\}}t|tjsqt|jjsqt $ ||v r,t|r,	 W d   qt	|s8	 W d   qW d   n1 sBw   Y  |
| q|S )z
    This function runs at compile time, and generates a list of indices for which we
    might need to do a copy to preserve alignment requirements.
    N)r   r   r   r   r?   r  r  re   rB   r@   r   )r  r7  Zids_to_checkr  r  ry   ry   rz   rv    s"   rv  ry   )rT  placeholdersmutated_input_idxsmodelCallable[..., Any]device_indexstack_traceslist[Optional[str]]r8  rT  tuple[torch.Tensor, ...]r  Sequence[PlaceholderInfo]r  tuple[int, ...]c                  sZ   ddl m}	 tjjrtj|	|||||||tjj	
 d	ntd  d
 fdd	}
|
S )Nr   )cudagraphify_impl)r  r  r8  r   rT  r  r  Z
compile_id
new_inputsr   rs   r	   c                   sJ    d u r!t   |  W d     | S 1 sw   Y   | S ru   )rv   r  )r  r  Zcudagraphify_fnr  r7  ry   rz   r  W  s   

zcudagraphify.<locals>.run)r  r   rs   r	   )Ztorch._inductor.cudagraph_treesr  r   rz  Zcudagraph_trees	functoolspartialr   r   CompileContextZcurrent_compile_id)r  r7  r  r  r8  r   rT  r  r  Znew_cudagraphify_implr  ry   r  rz   cudagraphify5  s"   
r  r   torch.Tensorc                 C  s   t j|  |  | j| jdS )z1
    Copy and input while preserving strides
    )r  r  )r   Zempty_stridedsizer   r  r  )r   ry   ry   rz   static_inputa  s   r  dstsrcexpanded_dimsc                 C  s"   t | |} t ||}| | dS )z=Index into expanded dimensions of both dst and src then copy_N)r8   Zcopy_)r  r  r  ry   ry   rz   index_expanded_dims_and_copy_h  s   

r  list[torch.Tensor] Callable[[list[InputType]], Any]c           	        s  t |}tt|t|| t|tsJ fddt|D fddt|D tt|D ]\}\}}t|tj	rM|vrMt
| || q5tj  tj }|tj  tj| | t W d   n1 sww   Y  |  tj | tj  tj tjj|dd | tW d   n1 sw   Y  tttfsftjrdfdd}nfddttD  d fdd}t||S )zQ
    Assumes inputs[static_input_idxs[i]] are always the same memory address
    c                   s$   g | ]\}}| vrt |ng qS ry   )r7   r  r7  ry   rz   
<listcomp>  s    z%cudagraphify_impl.<locals>.<listcomp>c                   s8   g | ]\}}t |tjs|n| vrt|n| qS ry   )r   r   r   r  detachr  r  ry   rz   r    s    
NZthread_local)streamZcapture_error_moder  list[InputType]rs   r  c                   s   t t | ks
J tt| D ]+\}\}}}t|tjs qt|tjs(J |v r7| | ks6J qt||| q|    	  S ru   )
r   r   zipr   r   r   Zdata_ptrr  r  replay)r  r   r  r  r  )r   inps_expanded_dimsr7  static_inputsstatic_outputsry   rz   r    s   
zcudagraphify_impl.<locals>.runc                   s   g | ]}| vr|qS ry   ry   r   r   r  ry   rz   r    s    c                   sN    D ]}| }| | }t |tjsJ t| || q|     S ru   )r   r   r   r  r  r  )r  r   r  r  )copy_indicesr   r  r  r  ry   rz   r    s   )r  r  rs   r  )rv  rJ   rg   rb   r   r   r   r  r   r   r  r   ZsynchronizeZStreamZwait_streamZcurrent_streamr  Z	CUDAGraphr   r  r   Zsize_assertsr   r   r`   )	r  r  r7  Zcheck_input_idxsr   r   r  r  r  ry   )r	  r   r  r7  r  r  rz   r  s  sL   











r  model_example_inputs_r  inner_compileOptional[dict[str, str]]Union[list[str], str]c           	      C  s  t | ts	J | t|  |d u rddini |ddi}|dtjj}|r1|dr0J dni |dt| j	i}|
dd }| jdd }tj|}td} tj|^ tdddd	G t 4 t| |tj||d
|d}t |ts|J |jW  d    W  d    W  d    W  d    S 1 sw   Y  W d    n1 sw   Y  W d    n1 sw   Y  W d    d S W d    d S 1 sw   Y  d S )Nr  Tzaot_inductor.output_pathz.pt2a
  The output path for aot_compile should not have an extension with .pt2 this is for specifying the output path for the .so in AOTInductor. If you would like to package the AOTInductor generated files into a pt2, please call `torch._inductor.aoti_compile_and_package`.r;  dynamo_compile_idcompile_fx_aot)r   Zreset_event_log_on_exit)r;  )r  r+  )r   rE   r(   r   r   r  output_pathendswithr,   coder   r   r   r   r  ri   Zset_aot_compilationZcompile_contextr   r%   
compile_fxr  r  r4   filename)	r
  r  r  r+  r  r;  Zsaved_compile_idZsaved_compile_contextZcompiled_artifactsry   ry   rz   r    s^   



hP r  aot_autograd_modelaot_example_inputsdynamo_modelnum_example_inputsr5  r;   r9  forward_devicer/   0Callable[[list[object]], Sequence[torch.Tensor]]c                   s>  ddl m}m}	 t|  tj| dd}
|
rt|  d ||  |	||  \} fddD  t| }t }|j	j
^ }}|jd }dd t|D |jd< tt|}tjj }dgd|d ur|jd usnJ |j}tdt|d	 tt  }|j}|d usJ d}t|dkrg tt|D ])}|vrd ||< |dkr|| ||d	  kr|d	7 }n|||  | q|jd usJ tt|jD ]}||vrd |j|< q|jr||jj7 }tj |d
d || |||d||
dW d    n	1 sw   Y  t!j"rS dfdd}d|_#|S )Nr   )%convert_conv_weights_to_channels_lastfreezeTr  c                   s   g | ]} | qS ry   ry   )r   ind)r  ry   rz   r  .  s    z(fw_compiler_freezing.<locals>.<listcomp>c                 S  "   g | ]\}}t |tjjr|qS ry   r   r   r   r   r   r   r  ry   ry   rz   r  6  s
    user_visible_output_idxsrR   r#  )r7  r5  r9  r   r<  r:  r|   list[object]rs   Sequence[torch.Tensor]c                   s&    fddD }    |S )Nc                   s"   g | ]} |t |   qS ry   )min)r   r  )r|   max_offset_idxunwrapped_args_offsetsry   rz   r  n  s    z9fw_compiler_freezing.<locals>.wrapper.<locals>.<listcomp>)r  )r|   Zargs_new)r&  optimized_functionpreserved_arg_indicesr'  r|   rz   wrapperm  s
   z%fw_compiler_freezing.<locals>.wrapper)r|   r#  rs   r$  )$Ztorch._inductor.freezingr  r  r   r[   Zdecide_layout_optr*  r   r"   r   r   r|   r   r   r   r   r   r   r   r   Zparams_flat_unwrap_subclassesr  rJ   r   Zparams_unwrapped_to_flat_indexaddr   Zparams_flatr   r   r   r  r}   ri   rr  Z_boxed_call)r  r  r  r  r  r5  r9  r  r  r  r:  Z	opt_modelr   r(  r  model_outputs_nodemodel_outputsr7  tracing_contextZparams_flat_unwrapZpreserved_indices_params_flatZunwrapped_idxscurrent_offsetr  r+  ry   )r  r&  r(  r)  r'  rz   fw_compiler_freezing  s   



r1  dict[str, object]c                   C  s8   t jjr
ttd t jjd urt jjnt ddddS )Nzcpp wrapper enabledFT)ztriton.autotune_at_compile_timeztriton.autotune_cublasLtztriton.cudagraphsztriton.store_cubin)r   rz  r5  r1   r0   Zautotune_at_compile_timerQ   ry   ry   ry   rz   get_cpp_wrapper_configz  s   r3  torch.fx.GraphModuleAbstractContextManager[None]c                 C  s   t j s	t S | jjdd}tdd |D }tdd t| j	d D }tdd ||B D }t
|dkrBt jtt|S t S )	zX
    Returns a cuda device context manager if there is a single device in the graph
    r   r   c                 s  s0    | ]}t |jd tjr|jd  jV  qdS r   N)r   r   r   r   r   r  r   r   ry   ry   rz   r     s    

z*get_cuda_device_context.<locals>.<genexpr>c                 s  s<    | ]}t |tjrt |jd tjr|jd  jV  qdS r6  )r   r   r   r   r   r   r   r  )r   argry   ry   rz   r     s    


r   c                 s  s    | ]
}|j d kr|V  qdS )r   N)r  )r   r  ry   ry   rz   r     s    rR   )r   r   r   r  r  r   r   rJ   rf   r|   r   r  rw  rx  )r   r   Zinput_devicesZout_devicesZcuda_devicesry   ry   rz   r    s"   
r  Callable[..., OutputCode]Optional[dict[str, Any]]decompositions.Optional[dict[OpOverload, Callable[..., Any]]]GUnion[Callable[[list[object]], Sequence[torch.Tensor]], str, list[str]]c                   sJ  |r$t | t|t ||dW  d   S 1 sw   Y  t jrt ddit  t|m |}ttrdd j	j
D }dd |D }tdd	 |D rtt ||D ]'\}}}	|durt|	tjspJ |j|	jkrtd
| d|j d|	j dq_|}t|tjdd|dW  d   W  d   S 1 sw   Y  W d   n1 sw   Y  tjt|d}
tst||
S ttrtj	jtrt||
S ttj t  tjjt j j!| ttr t"dfddd t#$dt%ddddd t&j	tj'j$_(t)|tdd	 |D rEt*||
W  d   W  d   W  d   S t j+rKJ t,|t-t j.j/ t0dt1t2|durd|nt3 }d: fd d!}tj|dd"}t4t5|}t j6rt7 stjt8 d#}ntj|dd"}t4t5|}d;d)d*}t9d+d,d< fd-d.}t4t5|}t:|ptj;j<dd/}tj=j>? ptj=>|}tj@rtAjdd0G tB|d|d1\}}d2d3lCmD} ||}|j	j
D ]&}|jEd4krd5|jFvrtG||jH}t|tjr|jI|dd6|jFd5< qW d   n	1 s(w   Y  tJ||}d7jFv rAjFd7 |jFd7< d8jFv rOjFd8 |jFd8< tjKL }|r[tjKjMntNjO}tP|Y tQR D | 0 |||W  d   W  d   W  d   W  d   W  d   W  d   S 1 sw   Y  W d   n	1 sw   Y  W d   n	1 sw   Y  tP| tj=S| tQR k tAjdd0T z@tT|||||d d9|W W  d   W  d   W  d   W  d   W  d   W  d   W  d   S  tUy0 } z|V dd}~ww 1 s5w   Y  W d   n	1 sEw   Y  W d   n	1 sUw   Y  W d   n	1 sew   Y  W d   n	1 suw   Y  W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )=a@  
    Main entry point for compiling given FX graph.  Despite the fact that this
    lives in :mod:`torch._inductor`, this function is responsible for calling
    into AOT Autograd (and we will eventually get a callback to
    ``inner_compile`` to perform actual compilation.  In other words, this
    function orchestrates end-to-end compilation for the inductor backend when
    you use :func:`torch.compile`.

    NB: This function TAKES OWNERSHIP of the input ``model_`` and can potentially
    mutate it!  Make a copy if you need to preserve the original GraphModule.
    )r  r;  Nr  Fc                 S  s"   g | ]}|j d kr|jdqS )r   r   )r   r   r   r7  ry   ry   rz   r    s
    

zcompile_fx.<locals>.<listcomp>c                 S  s    g | ]}t |tjr|nd qS ru   r   r   r   )r   inpry   ry   rz   r    s    c                 s  s    | ]}|d uV  qd S ru   ry   )r   vry   ry   rz   r     r   zcompile_fx.<locals>.<genexpr>zBDevice mismatch between fake input and example input at position #rm  z vs zx. If the model was exported via torch.export(), make sure torch.export() and torch.aot_compile() run on the same device.T)r  Zinductor_pre_grad_graphc                     s     j dddddt j  S )NFTr  z

 # graph id: )r  idr   ry   )r
  ry   rz   rd    s
    zcompile_fx.<locals>.<lambda>r  r  zBEFORE PRE GRADr  c                 s  s     | ]}t |tttfV  qd S ru   )r   r   r  r  r   ry   ry   rz   r   4  s    r   rE   r   r   r   r   rs   r9   c              
     sD  t d |rt|  tjjt|}t| }t	j
rtj|j  t }tjj }|d ur;|jr;|s;|jj}nd}ttr]jj^ }}	|	jdksPJ t|	j\}
}t|
}n|}||kseJ || }||ksoJ  fddt||D |jd< ng |jd< | |t||dW  d    S 1 sw   Y  d S )N$compile_fx.<locals>.fw_compiler_baser   r   c                   s"   g | ]}t  | tjjr|qS ry   r   r  r.  ry   rz   r    s    z8compile_fx.<locals>.fw_compiler_base.<locals>.<listcomp>r"  )r7  r5  r9  r   r<  )rv   r#   r   r   r  r   Znum_fw_fixed_argumentsr   rf   r   Zkeep_output_strider   arg_tree_leavesr|   r   r   r   r   Znum_mutated_inp_runtime_indicesr   rE   r   r   r   tree_flattenr   r   r   )r   r   r   r   r-  Znum_model_outputsr   Zoriginal_output_start_indexr  Zorig_model_outputs_nodeZorig_model_outputsZnum_orig_model_outputsZorig_output_end_idxr5  r  r9  r  r
  r  rC  rz   fw_compiler_baseR  sV   



$rB  r  )r  r  r  r5  r9  r  joint_inputsSequence[object]r~   r}   tuple[GraphModule, GraphModule]c                 [  sN   t | }| t|  W d    n1 sw   Y  t| |fi |ddiS )NcompilerrG  )r  r   r   )r   rH  r~   r  ry   ry   rz   partition_fn  s   
z compile_fx.<locals>.partition_fnZbackward)rF  c                   s*  ddl m} td |e t| }tjr)tj|j	 }dd t
|D |jd< ng |jd< t| }tjr;tt nt % | |tt| ddW  d    W  d    W  d    S 1 sgw   Y  W d    n1 svw   Y  W d    d S W d    d S 1 sw   Y  d S )	Nr   )compile_lockcompile_fx.<locals>.bw_compilerc                 S  r  ry   r   r!  ry   ry   rz   r    s    z3compile_fx.<locals>.bw_compiler.<locals>.<listcomp>r"  T)r7  r5  r8  r9  r<  )Ztorch._dynamo.convert_framerM  rv   r#   rf   r   Zbw_outputs_user_visibler   rD  r|   r   r   r<   r  r  r3  r  r  r   r   )r   r   rM  r-  r.  r   )r5  r  r9  r  ry   rz   bw_compiler  s<   

PrN  r"  )Zunlift_effect_tokens)Ztrace_jointr;  r   )_detect_fake_mode_from_gmZget_attrr   )Zstatic_shapesZ dynamo_flat_name_to_original_fqnr  )fw_compilerrO  inference_compilerr;  rL  Zkeep_inference_input_mutationsr5  )r   rE   r   r   r   r   rs   r9   )r   rE   rH  rI  r~   r}   rs   rJ  )r   rE   r   r   rs   r9   )Wr   r  r  r  r3  ri   Zset_real_inputsr   rE   r   r   r  r  r   r   r   r  r  r  r  graph_returns_tuplemake_graph_return_tuple_codegenrP   handle_dynamo_export_graphrO   rK  rL  r   r   r  Zpreserve_node_metar  r  rC   pre_grad_graphs_logru  r&   rA  r  Z_pre_grad_graph_idr   r$   Z_raise_error_for_testingr   r;   rz  r5  r/   rw  _graph_counterrU   r+   r9   ZfreezingZis_grad_enabledr1  rD   r"   r%  r&  r   r   r   rr  functorch_configr)   Ztorch._export.utilsrP  r   r   r   r   Zfrom_tensorr   Z_CZ_is_any_autocast_enabledZ_DisableAutocastr  r  r  r   _disabletracingrL   rM   Zremove_dynamo_frames)r
  r  r  r+  r;  Zinputs_Zfake_inputsr   fir  Zrecursive_compile_fxrG  rQ  rR  rL  rO  r(  r/  r   r   rP  r   r   r   Zdisable_ampr   r  ry   rF  rz   r    s   
	
L+	



,*Q



$



*  ,  2 

8  ,  #
2    T r  c                 C  sz   t | tsdS t| j\}t |ttfrdS t |tjjj	r;t
|jdr;t|jjjdkr;tdd |jjjD r;dS dS )z"True if a FX graph returns a tupleT_schemarR   c                 s  s    | ]
}t |jd kV  qdS )r   N)rr   r  )r   retry   ry   rz   r   C  s    z&graph_returns_tuple.<locals>.<genexpr>F)r   rE   rf   r|   r   r  r   r   r   r   hasattrr   r   r]  returnsall)r   rvry   ry   rz   rS  8  s   

rS  
compile_gmc                   s   t | }|j\}t|\}| j| | j| W d   n1 s&w   Y  | j| t| s7J || | t	
 d fdd}|S )	z
    Mutate gm so it returns a tuple.  This is only needed for graphs
    not created by torchdynamo that return non-tuples.
    Nr|   r	   r~   rs   c                    s   t  | i |S ru   )r   Ztree_unflattenr   r  specry   rz   r+  ]  s   z(make_graph_return_tuple.<locals>.wrapper)r|   r	   r~   r	   rs   r	   )rf   r|   r   rE  r   Zinserting_beforer   r  rS  r  wraps)r   r  rc  r   rb  r+  ry   rd  rz   rT  J  s   	
rT  c                   sN   | j j tjj  | j _|   ||  j| td fdd}|S )z
    `torch._dynamo.export` embeds pytrees in the FX graph codegen object,
    convert that to a normal FX graph so inductor can compile it.
    r|   r	   rs   c                    s      j|   S ru   )Zprocess_outputsprocess_inputsr*  codegenr  ry   rz   r+  s  s   z+handle_dynamo_export_graph.<locals>.wrapperN)r|   r	   rs   r	   )	r   rU  r   r   CodeGenr  rg  r  rf  )r   r  rc  r+  ry   rh  rz   rV  d  s   	rV  r   r[   c                 C  s~   d	dd}t | j | jD ]-}t|tsqt|}|r(t|r(|	 t
jkr)qt|}|jddr6 d S ||  qd S )
Nr  Optional[torch.device]rs   r   c                 S  sF   ddl m} | d usJ t| j}|| }t|j d |d)Nr   )rN   z9 does not support bfloat16 compilation natively, skippingzBF16 is not supported)Ztorch._dynamo.excrN   r   r  Zget_device_propertiesr   r   r   )r  rN   device_interfaceZdevice_propsry   ry   rz   warn_and_skip{  s   


z1_check_triton_bf16_support.<locals>.warn_and_skipF)Zincluding_emulation)r  rk  rs   r   )r   r   Zgraph_inputsr  r  r   r]   r\   r?   Z	get_dtyper   Zbfloat16r   Zis_bf16_supportedZ
get_device)r   rm  r   r  rl  ry   ry   rz   r  z  s    

r  )options!Union[list[Any], tuple[Any, ...]]rn   tuple[list[Any], dict[str, Any]]c                C  sB  ddl m} || sJ dd}d}t| jjtjjjrA| jj}tjj | j_| 	  |j
jdur6|j
j}|j
jdur@|j
j}nt| drI| j}t| drQ| j}|durZt|nd}|duret|nd}	t||pmi f\}
}dd	 |
D }|dur||krtd
| d| |du r||	dni |||	d}||fS )z
    Flatten the inputs to the graph module and return the flat inputs and options.
    Add "aot_inductor.serialized_in_spec" and "aot_inductor.serialized_out_spec" to the options.
    rR   )rS  zGraph output must be a tuple(). This is so that we can avoid pytree processing of the outputs. Please change the module to have tuple outputs.N_in_spec	_out_spec c                 S  s(   g | ]}t |d  tjr|d  ndqS )rR   Nr>  r   ry   ry   rz   r    s    z(_aoti_flatten_inputs.<locals>.<listcomp>z>Trying to flatten user inputs with exported input tree spec: 
z-
but actually got inputs with tree spec of: 
)zaot_inductor.serialized_in_specz aot_inductor.serialized_out_spec)r  rS  r   r   rU  r   r   rP   rj  r  Zpytree_infoin_specout_specr_  rq  rr  r   Ztreespec_dumpsZtree_flatten_with_pathr  )r   r|   r~   rn  rS  rt  ru  ri  Zserialized_in_specZserialized_out_specZflat_args_with_pathZreceived_specZflat_example_inputsry   ry   rz   _aoti_flatten_inputs  s`   



rv  )rq   rr   rs   rt   )r|   r}   r~   r}   rs   r   )rs   r   )r   r   rs   r   )r   rE   rs   r   )rs   r   r  )r   rE   r   rE   r   r   rs   rE   )r   rE   rs   r   )r   rE   r   r   rs   rE   )F)r   rE   r   r   rs   r   )TNN)
r   rE   r   r   r   r   r   r   rs   r   )r   rE   rs   r   )r   r   rs   r  )r  r   r  r   rs   r  )r   rE   r   r   r   r   rs   r!  ru   )r+  r,  rs   r-  )rs   r/  rC  )r   rE   r   r   rR  r@  rs   r9   )
r   rE   r   r   r  r6  rR  r@  rs   r9   )r  r   r7  r6  rs   r6  )ry   )r  r  r7  r6  r  r   r  r  r8  r   r   r   rT  r  r  r  r  r  rs   r  )r   r  rs   r  )r  r  r  r  r  r   rs   r   )r  r  r  r  r7  r6  rs   r  )
r
  rE   r  r  r  r?  r+  r  rs   r  )r  rE   r  r   r  rE   r  r   r  r  r5  r;   r9  r   r  r/   rs   r  )rs   r2  )r   r4  rs   r5  )r
  rE   r  r   r  r9  r+  r:  r;  r<  rs   r=  )r   rE   r  r   rc  r  rs   r  )r   r[   rs   r   )
r   r4  r|   ro  r~   r:  rn  r:  rs   rp  )
__future__r   r  enumr  r  r   ra  r   r   r  r{  r   abcr   r   collectionsr   r   inspectr   r   typingr	   r
   r   r   r   r   Ztyping_extensionsr   r   r   r   r   r   Zunittestr   Ztorch._inductor.async_compiler   Ztorch.fxZtorch.utils._pytreer   Z_pytreer   Zfunctorch.compiler   r   Ztorch._dispatch.pythonr   Ztorch._dynamor   r   rK  r   rv   Ztorch._dynamo.device_interfacer   Ztorch._dynamo.repro.after_aotr   Ztorch._dynamo.utilsr   r    r!   r"   r#   r$   r%   r&   r'   Ztorch._functorchrY  Z7torch._functorch._aot_autograd.subclass_parametrizationr(   Ztorch._functorch.aot_autogradr)   r*   r+   Ztorch._inductor.codecacher,   r-   r.   r  r/   r0   r1   r2   Ztorch._inductor.debugr3   Ztorch._inductor.output_coder4   r5   r6   r7   r8   r9   Z%torch._inductor.runtime.runtime_utilsr:   Ztorch._inductor.utilsr;   r<   r=   r>   r?   r@   rA   rB   Ztorch._loggingrC   Ztorch._utils_internalrD   rE   Z%torch.fx.experimental.symbolic_shapesrF   rG   Z torch.fx.passes.fake_tensor_proprH   Ztorch.monitorrI   Ztorch.utils._ordered_setrJ   Z_dynamo.backends.commonrL   Z_dynamo.excrM   rN   Zfx._lazy_graph_modulerO   Zfx.graphrP   Zutils._tritonrQ   rs  rS   ru  rT   decompositionrU   excrV   Zfx_passes.joint_graphrW   Zfx_passes.post_gradrX   rY   Zfx_passes.pre_gradrZ   r   r[   Zirr\   r]   Zoutput_coder^   Ztriton_bundlerr_   r`   ra   rb   rc   rd   re   rf   rg   rh   Zvirtualizedri   collections.abcrj   rk   rl   Z
torch._opsrm   rn   ro   rp   r  r{   r   Ztorch._inductor.fb.utilsZ&torch._functorch._aot_autograd.schemasr   r   r   Enumr   r   r  r   r   r   Z_loggingZgetArtifactLoggerr  rW  r  rt  r   r   	lru_cacher   r   r   r   r   r   r   r	  r  r  r  r*  r.  contextmanagerr3  r4  r?  rD  rP  r  r  r  r|  rv  r  r  r  r  r  rX  r1  r3  r  r  rS  rT  rV  r  rv  ry   ry   ry   rz   <module>   s:     , (
,

	
LH 	5 ^   %,b>g!   "