a
    h                     @   sz  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZmZ ddlZddlm  mZ ddlmZmZmZmZmZ ddlmZmZ ddlmZmZmZmZ ddl m!Z!m"Z" dd	l#m$Z$m%Z%m&Z& dd
l'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z< ddl=m>Z>m?Z? ddl@mAZA ddlBmCZC erddlDmEZE dZFe(eGdZHe(eGdZIdd ZJdd ZKdd ZLdd ZMG d d! d!ZNG d"d# d#ZOG d$d% d%ZPeO ZQg d&ZReAeeejSegZTeU aVd'd( ZWG d)d* d*ZXd+aYd+aZd+a[d+a\da]ej^d9e_d-d.d/Z`ej^d0d1 Zadd2d3d4Zbd5d6 Zcd7d8 ZddS ):a  
Provides functionality for compiling PyTorch's autograd (automatic differentiation) system.

This module implements compiled autograd, which traces and optimizes backward pass
computations at runtime. The key components are:

- AutogradCompilerInstance: Traces and compiles autograd graphs using FX
- Context managers (_enable/_disable): Control when compiled autograd is active
- Utility functions: Support graph manipulation, tensor operations, and hooks

Compiled autograd can significantly improve backward pass performance by removing
Python overhead and enabling additional optimizations. It works by capturing
backward computations into an FX graph that can be compiled and optimized,
while maintaining the same semantics as eager mode autograd.
    N)Counterdefaultdict)OptionalTYPE_CHECKINGUnion)call_accumulate_gradcall_backward	call_hookFakeCompiledAutogradEngineunwrap_maybe_dynamic_intGetItemSourceLocalSource)countersget_chromium_event_loggerlazy_format_graph_codeset_locals_to_steal)AutogradLazyBackwardCompileInfo%CachedAutogradLazyBackwardCompileInfocompile_contextCompileContext	CompileId)getArtifactLoggertrace_structuredclone_preserve_strides)FakeTensorMode)GraphModule)BackwardState)	decomposedisable_autocast_cachedisable_proxy_modes_tracingfetch_object_proxyProxyTorchDispatchModePythonKeyTracertrack_tensor_tree)
DimDynamicShapeEnv)preserve_node_metaset_stack_trace)
OrderedSet)CapturedTraceback)Proxya  You can turn off compiled autograd by either:
1. Moving the unsupported autograd call outside of the torch.compile'd region.
2. Wrapping the unsupported autograd call in the torch._dynamo.compiled_autograd._disable() context manager.
3. Setting torch._dynamo.config.compiled_autograd=False for the torch.compile call containing the unsupported autograd call.
4. Setting torch._dynamo.config.compiled_autograd=False at the start of the program.compiled_autogradcompiled_autograd_verbosec                   C   s   t jjjdS )Nr/   )torchZ_logging	_internalZ	log_stateZis_artifact_enabled r2   r2   M/var/www/auris/lib/python3.9/site-packages/torch/_dynamo/compiled_autograd.py snapshot_verbose_logging_enabledR   s    
r4   c                   C   s   t jjjjS N)r0   Z	_inductorconfigZtritonZ
cudagraphsr2   r2   r2   r3   snapshot_cudagraph_enabledX   s    r7   c                 C   s   | d urt | S | S r5   r   )xr2   r2   r3   maybe_clone\   s    r9   c                 C   sd   t | jtr| jjS t | jtrXtjj  | j	 W  d    S 1 sL0    Y  nt
dd S )NzEUnexpected Lazy Backward Compilation Info Type. Please file an issue.)
isinstance_lazy_backward_infor   	bw_moduler   r0   Z_subclassesZfake_tensorZunset_fake_temporarilyZbw_module_fnAssertionError)CompiledFunctionr2   r2   r3   extract_bw_moduleb   s    *r?   c                   @   sT   e Zd ZedddZejjdddZe	ej
 ddd	Ze	ej
 d
ddZdS )
NaNChecker)accumulate_gradc                 C   s   || _ g | _i | _g | _d S r5   )rA   params_indicesparams_to_checkoutput_names)selfrA   r2   r2   r3   __init__~   s    zNaNChecker.__init__graphc                 C   s   t t|j}|jdtd}|jddd jd }| jt|krL| j| ksPJ |D ]N}|jd }|jt	j
kr|jd |u rt|jd tsJ | j|jd  qTdd |D | _d S )	Ncall_functionoptargetoutputrK   r      c                 S   s   g | ]
}|j qS r2   )name).0noder2   r2   r3   
<listcomp>       z.NaNChecker.prep_with_graph.<locals>.<listcomp>)nextiternodes
find_nodesr   argsrA   boolrL   operatorgetitemr:   intrB   appendrD   )rE   rH   Zinputs_nodeZacc_grad_nodesoutput_nodesrR   
param_noder2   r2   r3   prep_with_graph   s(    


zNaNChecker.prep_with_graph)inputsc                 C   s`   | j s
d S | jD ]J}|| j}|d urDt| rDJ d| d|| | jd| d< qd S )Nz9Compiled autograd running under anomaly mode with inputs[zD] already having NaN gradient. This is not supported. {TURN_OFF_MSG}zinputs[])rA   rB   gradr0   isnananyrC   )rE   rb   idxrd   r2   r2   r3   prep_with_inputs   s    


zNaNChecker.prep_with_inputs)outc                 C   s   | j rj|rJ g }| j D ]0\}}|jd us2J t|j r|| q|rtdd	| dnNg }t
|D ]&\}}t| rv|| j|  qv|rtdd	| dd S )Nz9Compiled Autograd returned NaN gradients for parameters: ,.z;Compiled Autograd returned NaN gradients for output nodes: )rA   rC   itemsrd   r0   re   rf   r^   RuntimeErrorjoin	enumeraterD   )rE   ri   Z
nan_paramsZ
inputs_strparamZ	nan_gradsird   r2   r2   r3   check   s&    zNaNChecker.checkN)__name__
__module____qualname__rZ   rF   r0   fxGraphra   tupleTensorrh   rr   r2   r2   r2   r3   r@   }   s   r@   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )OpNamespacec                 C   s   t  | _d S r5   )r   custom_function_name_counterrE   r2   r2   r3   rF      s    zOpNamespace.__init__c                    s   |r4d| }| j | }| j |  d7  < | | }t| |rBJ t||| |rht| |tj  n tjj fdd}t| || |S )NZCppNoderO   c                     s    | i |S r5   r2   rY   kwargsresultr2   r3   run_non_traceable_cpp_in_eager   s    z7OpNamespace.add.<locals>.run_non_traceable_cpp_in_eager)r{   hasattrOpsetattrr0   _dynamoallow_in_graphdisable)rE   rP   fnis_custom_functionis_traceablecountr   r2   r   r3   add   s    
zOpNamespace.addc                 C   s
   t | |S r5   )getattr)rE   rP   r2   r2   r3   get   s    zOpNamespace.getN)rs   rt   ru   rF   r   r   r2   r2   r2   r3   rz      s   rz   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )r   c                 C   s   || _ || _|| _d| _d S )Nz#torch._dynamo.compiled_autograd.ops)r   r   rs   rt   )rE   rP   r   r   r2   r2   r3   rF      s    zOp.__init__c                 O   s   | j |i |S r5   )r   )rE   rY   r~   r2   r2   r3   __call__   s    zOp.__call__c                 C   s   | j d | j S )Nrk   )rt   rs   r|   r2   r2   r3   __repr__   s    zOp.__repr__N)rs   rt   ru   rF   r   r   r2   r2   r2   r3   r      s   r   )rb   sizesscalarshooksZpacked_datac                 C   s   t tt| d d dS )N)compiled_autograd_idZframe_idZframe_compile_idr   )r   r2   r2   r3   make_compile_context  s    r   c                   @   s  e Zd ZddddZdd ZeedddZee	j
 ee eeeef  eeeeef   eed	d
dZee dddZdd Zee	jjjee dddZdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Z d$d% Z!d&d' Z"d(d) Z#ed*d+d,Z$ee	j
 eed-d.d/Z%d0d1 Z&d2d3 Z'd4d5 Z(ee dd6d7Z)d8d9 Z*d:d; Z+d<d= Z,d>d? Z-d@dA Z.edBdC Z/edDdE Z0dFdG Z1dHdI Z2dJdK Z3dLdM Z4dNdO Z5dPdQ Z6dRdS Z7dTdU Z8d_eeeeef   dVdWdXZ9edYdZd[Z:eeee	jj; d\d]d^Z<dS )`AutogradCompilerInstanceNreturnc                 C   sT   || _ t | _| jj| _t | _tdd| jd| _t	 | _
t| j
d| _d | _d S )NT)Zallow_fallback_kernelsZallow_non_fake_inputs	shape_envZsymbolic)compiler_fn
contextlib	ExitStackstackcloser(   r   r   fake_tensor_moder%   	fx_tracerr$   
proxy_modehooks_proxy)rE   r   r2   r2   r3   rF     s    

z!AutogradCompilerInstance.__init__c                 C   s    t |tjsJ | jj||dS )N)source)r:   r0   ry   r   Zfrom_tensor)rE   r8   r   r2   r2   r3   	wrap_fake"  s    z"AutogradCompilerInstance.wrap_fakec                 C   s   t t| |S r5   r   )rP   rg   r2   r2   r3   r   &  s    zAutogradCompilerInstance.source)rb   r   r   originsrA   
check_nansc              
      s   t d d  d7  < tt _tt _t j _ j	  |rJt
|nd  _t  _t jd jd jidd tj  j_tjjtd j_i  j_i  _ fdd	tD \} _ _ _ _  j!"t#  |\}}	}
|d
 }z.t$|D ] \}} %| &d|||< qW n@ t'yV } z&t(dt)| dt* |W Y d }~n
d }~0 0  +|||  fddt$|D } fddt,t-|D }t$|D ]6\}} j.dt/|| fi ||< ||  j|j0< q +|||	}t$|D ]|\}} &d|}t1|tr$ j23||t4j5||< nBt1|t6rX j2j7 j2j8||t4j5d||d||< nt9dt)|q +| j|
 t$|D ]\}} j|  j|j0< q j!"t:i   j!" j;  j!" j<  j!"t=   j;j2d usJ  j;j2} j!"tjj>j?@| tAtBC |||fS )Nr.   ZcapturesrO   graph_idTZlog_pt2_compile_event)Z
tracer_clsc                 3   s    | ]} j d |di V  qdS )placeholderr2   N)r   create_proxy)rQ   rP   r|   r2   r3   	<genexpr>J  s   z9AutogradCompilerInstance.begin_capture.<locals>.<genexpr>r   rb   zFound tensor of type z,, which is not supported by FakeTensorMode. c              	      s*   g | ]"\}} j | d |tjqS )r   )r   $create_unspecified_symint_and_symbolr   r'   DYNAMIC)rQ   rg   valr|   r2   r3   rS   ^  s   
z:AutogradCompilerInstance.begin_capture.<locals>.<listcomp>c                    s   g | ]} j | qS r2   )sizes_proxyrQ   rq   r|   r2   r3   rS   j  rT   rI   r   )r   Zdynamic_dim)hintr   zUnexpected scalar type: )Dr   rU   COMPILE_COUNTERidr   r]   aot_id_counterr   r   	__enter__r@   nan_checkertimetime_nsstart_time_nsr   Zlog_event_startr0   nnModuler   rootrv   rw   r%   rH   Ztensor_attrssymnode_proxy_lookup_graph_placeholdersr   Zscalars_proxyr   packed_data_proxyr   enter_contextr)   ro   r   r   	ExceptionNotImplementedErrortypeTURN_OFF_MSGbind_objects_to_proxiesrangelenr   r   rR   r:   r   r   r'   r   floatZcreate_symfloatnodeZcreate_unspecified_symbolr=   r    r   r   r!   experimentalZsymbolic_shapesZ_suppress_guardsstrr   Zcurrent_compile_id)rE   rb   r   r   r   rA   r   Z
args_proxyZinputs_originsZsizes_originsZscalars_originsr8   rg   eproxiesrq   Zsymintr   r   Zsymvalenvr2   r|   r3   begin_capture*  s    	









z&AutogradCompilerInstance.begin_capturecompile_reasonsc                    s&    sJ t ddd  fddd d S )Nartifactc                   S   s
   dddS )NZ!compiled_autograd_compile_reasonsjsonrP   encodingr2   r2   r2   r2   r3   <lambda>  s    z>AutogradCompilerInstance.log_compile_reasons.<locals>.<lambda>c                      s    S r5   r2   r2   r   r2   r3   r     rT   Zmetadata_fn
payload_fn)r   )rE   r   r2   r   r3   log_compile_reasons  s    
z,AutogradCompilerInstance.log_compile_reasonsc                    s   fdd  D }j}t||j|j|j ~t rZjD ]}	|	j	rFt
dqFtjjfdd}
jjd|
||g|R i dd |d urj|  fdd	}| }fd
d}tjjjj||d}tj|}|S )Nc                    s   g | ]}  |qS r2   to_proxyrQ   r   r|   r2   r3   rS     rT   zDAutogradCompilerInstance.proxy_call_aot_backward.<locals>.<listcomp>z@torch.compile does not currently support higher order gradients.c                    s"   t jjjj| | g|R  }|S r5   )r0   
_functorch_aot_autogradruntime_wrappersZ_backward_prologue_functional)Zctx_saved_tensorsZctx_symints	flat_argsri   )maybe_subclass_metadatametadatar2   r3   call_aot_bwd_prologue  s    
zOAutogradCompilerInstance.proxy_call_aot_backward.<locals>.call_aot_bwd_prologuerI   kindrL   rY   r~   c                     s  dd } | j }fddt|td u D } }t|tjksRJ fdd|D }||d t|< d ur| d}i d }t j r dj  7  j  d7  <  fd	d
}j j	D ]}|j
dkr|| j}	||j|	_|	|< |d7 }q|j
dkrTt|jdks8J fdd|jd D }q|j
dkr|j}
j|
}tjj|t|
 jd|di }||j|_||< q|j
dkr|jtjjjjkrtjjjj|_jj |fdd}||j|_||< q|j
dkrh|j}
j|
}tjj|t|
 jj |fdd}||_||< qtdq|d usJ dd fdd|D }|| |S )Nc                 S   s.   d}| j D ]}|jdkr$|d7 }q
q
 q*q
|S )Nr   r   rO   )rW   rK   )rH   num_argsrR   r2   r2   r3   
num_inputs  s    

zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.num_inputsc                    s   g | ]} | qS r2   r2   r   )pgradsr2   r3   rS     s   zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<listcomp>c                    s   g | ]}  |qS r2   r   r   r|   r2   r3   rS     rT   r   _rO   c                    s   d  d|  S )NZaotr   r2   )	node_name)deduped_aot_idr2   r3   make_unique  s    zlAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.make_uniquer   rM   c                    s2   g | ]*}t |tjjr*tj|  jn|qS r2   )r:   r0   rv   Noder-   r   rQ   n)rE   value_remapr2   r3   rS      s   Zget_attrr2   rI   c                    s    |  S r5   r2   r   r   r2   r3   r   4  rT   ziAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<lambda>Zcall_modulec                    s    |  S r5   r2   r   r   r2   r3   r   =  rT   zshouldn't get herec                   S   s<   t  " tdddddW  d    S 1 s.0    Y  d S )Nr   {   r"   r0   zerosr2   r2   r2   r3   dummyH  s    zfAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.dummyc                    s$   g | ]}t |tjjr  n|qS r2   )r:   r0   rv   r-   )rQ   o)r   r2   r3   rS   L  s   )rH   r   r]   _get_compiled_autograd_symintsr   symintsr^   r   r   rW   rK   rR   rP   rY   rL   r   Zget_fresh_qualnamer   r   r   create_noder0   opsatenviewdefaultZreshapeZ	node_copyr=   r   )r   r   Z	pall_argsr   psymintsZargs_idxZpoutputsr   rR   phrP   qualnamer   outputs)aot_idr<   ctxpbackward_stater   rE   )r   r   r   r3   copy_paste_aot_backward_graph  s|    










zWAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graphc                    sX   t jj fdd}tj|}jjd|t|i d}	 }
|g|g |S )Nc                     s   j |  dS )N)
is_runtime)Zcreation_fn)unwrapped_argsr  subclass_metar2   r3   make_subclassU  s    zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor.<locals>.make_subclassrI   r   )r0   r   r   pytreetree_mapr   r   r   rx   allocate_dummyr   )r	  r  r  r
  Zpunwrapped_argsZpoutputrM   r|   r  r3   proxy_subclass_constructorT  s    zTAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor)Zmake_subclass_override)r   _forward_clsr?   r   r   _aot_idr0   Zis_grad_enabledZoutput_infoZrequires_gradrm   r   r   r   r   r   r   r   r   Z_backward_epilogue_functionalr  r  r   )rE   pinputspsaved_tensorssaved_tensorspctxr  maybe_backward_state_idxr   r>   Zoutput_alias_infor   r  r  r  resultsZpresultsr2   )r  r<   r  r   r   r  r   rE   r3   proxy_call_aot_backward  sN    


e
z0AutogradCompilerInstance.proxy_call_aot_backward)backward_idxr  r  c              
   C   s  | j d usJ | j | }| |}| |}	t|jdrN| ||	||||}
n| jjdt||	g|R i d}
|
d usxJ t v g }t	|D ]L\}}|d u s|
| d u r|
d  q|\}}}}|
tj||||d q| ||
 W d    n1 s0    Y  t|S )Nr  rI   r   )sizedtypelayoutdevice)r   r   r   r  r  r   r   r   r"   ro   r^   r0   emptyr   rx   )rE   rb   Zoutput_metadatasr  r  r  r  r  r  r  r   Zgrad_insrg   output_metadatar  r  r  r  r2   r2   r3   proxy_call_backwardo  sH    	


	

*z,AutogradCompilerInstance.proxy_call_backwardc           	   	   C   sJ   ||  ||  ||  ||  ||  ||  |f}| t|d gd S )N   )r   
proxy_callcopy_slices_prologue)	rE   rb   
base_sizesbase_stridesbase_storage_offset
view_sizesview_stridesview_storage_offsetrY   r2   r2   r3   call_copy_slices_prologue  s    	z2AutogradCompilerInstance.call_copy_slices_prologuec                 C   s    |  t||||fd gt| S r5   )r!  copy_slices_epiloguer   )rE   needs_input_gradr   res
grad_slicer2   r2   r3   call_copy_slices_epilogue  s
    
z2AutogradCompilerInstance.call_copy_slices_epiloguec                 C   s8   t   tddgW  d    S 1 s*0    Y  d S )Nr   i[r   r|   r2   r2   r3   r    s    z'AutogradCompilerInstance.allocate_dummyc                 C   s   t ||||S )zBinds ops.fn_name = fn)r   r   )rE   fn_namer   r   r   r2   r2   r3   bind_function  s    z&AutogradCompilerInstance.bind_functionc                 C   s    t |}| ||g|R |S )z:Proxies a call to ops.fn_name(grads, *args) into the graph)r   r   r!  )rE   r/  ZgradsrY   r  rK   r2   r2   r3   apply_functional  s    
z)AutogradCompilerInstance.apply_functionalc                    sn   t |\}}t fdd|}jjd||i d fdd|D }| fddtt|D  |S )z*Proxies a call to fn(*args) into the graphc                    s
     | S r5   r   )r   r|   r2   r3   r     rT   z5AutogradCompilerInstance.proxy_call.<locals>.<lambda>rI   r}   c                    s   g | ]}   qS r2   )r  )rQ   r   r|   r2   r3   rS     rT   z7AutogradCompilerInstance.proxy_call.<locals>.<listcomp>c                    s   g | ]} | qS r2   r2   r   )	proxy_outr2   r3   rS     rT   )r  Ztree_flattenr  r   r   r   r   r   )rE   r   rY   r  r   r   
proxy_argsr   r2   )r2  rE   r3   r!    s    "z#AutogradCompilerInstance.proxy_callc                 C   sX   t d}t| j|g|R }| jjd||i d}t|t|ksHJ | || |S )zEProxies a call to ops.validate_outputs(outputs, *args) into the graphvalidate_outputsrI   r}   )	r   r   r  r  r   r   r   r   r   )rE   r   r  rY   r  rK   r3  Znew_proxy_outputsr2   r2   r3   r4    s    
z)AutogradCompilerInstance.validate_outputsc                 C   sJ   |  |}|  |}| jjdtj||fi d}|  }| |g|g |S NrI   r}   )r   r   r   r0   r   r  r   )rE   Zold_varZnew_varZold_var_proxyZnew_var_proxyr2  r   r2   r2   r3   
accumulate  s    

z#AutogradCompilerInstance.accumulatec                 C   s*   | j jdt| || ||fi d d S r5  )r   r   r   r   )rE   variablerd   Zhas_post_hooksr2   r2   r3   rA     s    z(AutogradCompilerInstance.accumulate_gradc                    s(    j dt|g fdd|D R |S )NrI   c                    s   g | ]}  |qS r2   r   rQ   r8   r|   r2   r3   rS     rT   z<AutogradCompilerInstance.proxy_call_hook.<locals>.<listcomp>)r   r   r	   )rE   hookrY   r~   r2   r|   r3   proxy_call_hook  s    z(AutogradCompilerInstance.proxy_call_hookc                 C   sN   | j d usJ | j | }| j| }| j||dd}|  }| |g|g |S )Nunpack_hook	hook_type)r   r   r:  r  r   )rE   hook_idZdata_idr9  dataproxyri   r2   r2   r3   r;    s    

z$AutogradCompilerInstance.unpack_hook)rq   c                 C   sz   | j d usJ | j | }| j||| dd}t 4 t|| ||< | || g|g W d    n1 sl0    Y  |S )Ntensor_pre_hookr<  )r   r:  r"   r9   r   )rE   rb   r>  rq   r9  r@  r2   r2   r3   rA    s    
2z(AutogradCompilerInstance.tensor_pre_hook)rb   r>  rq   c              	   C   sv   | j dtjjjj|| || fi }t 4 t	|| ||< | 
|| g|g W d    n1 sh0    Y  |S NrI   )r   r   r0   _Cr   r.   Zcall_cpp_tensor_pre_hooksr   r"   r9   r   )rE   rb   r>  rq   r@  r2   r2   r3   cpp_tensor_pre_hook  s    
2z,AutogradCompilerInstance.cpp_tensor_pre_hookc                 C   sl   | j d usJ | j | }| j||dd}t * dd |D }| || W d    n1 s^0    Y  |S )Npre_hookr<  c                 S   s   g | ]}t |qS r2   r9   r8  r2   r2   r3   rS   2  rT   z5AutogradCompilerInstance.pre_hook.<locals>.<listcomp>r   r:  r"   r   )rE   rb   r>  r9  r   r2   r2   r3   rE  )  s    
*z!AutogradCompilerInstance.pre_hookc                 C   sn   | j d usJ | j | }| j|||dd}t * dd |D }| || W d    n1 s`0    Y  |S )N	post_hookr<  c                 S   s   g | ]}t |qS r2   rF  r8  r2   r2   r3   rS   @  rT   z6AutogradCompilerInstance.post_hook.<locals>.<listcomp>rG  )rE   r  rb   r>  r9  r   r2   r2   r3   rH  6  s    
*z"AutogradCompilerInstance.post_hookc                 C   sz   t |tjsJ | jd usJ | j| }| j||dd}t ( t|g}| ||g W d    n1 sl0    Y  |S )Npost_acc_grad_hookr<  )r:   r0   ry   r   r:  r"   r9   r   )rE   inputr>  r9  r@  r2   r2   r3   rI  D  s    

,z+AutogradCompilerInstance.post_acc_grad_hookc                 C   sF  i }d}t |j}|d jdks$J |d }t |j }tt}|| |d ksVJ |t| d }|| |d kszJ t|D ]|\}	}
|s|
jd j	j
dkrd}q|
jd j	j
d	k}t|
jd  dk}|r|rt |
j }td
d |D r|
||	< q|rB| D ]&}
td|
 |
jd  |
jd< qt | S g S )NFr   rb   rO   r   cudaTcpuc                 s   s@   | ]8}t |jtjjr"|jjd v p6t |jto6|jj V  qdS ))Zprimsr   N)r:   rL   r0   Z_opsZ
OpOverload	namespacer   r   rQ   userr2   r2   r3   r   n  s   	
zDAutogradCompilerInstance.move_graph_nodes_to_cuda.<locals>.<genexpr>zMoving node %s from cpu to cuda)listrW   rL   userskeysr   r   ro   metar  r   r  allvaluesverbose_logdebugrL  )rE   rH   Zto_moveZhas_cuda_inputsrW   rb   Zinputs_usersZfirst_getitem_idxZlast_getitem_idxrq   rR   Zis_cpuZ	is_scalarZ
node_usersr2   r2   r3   move_graph_nodes_to_cudaW  s8    
	
z1AutogradCompilerInstance.move_graph_nodes_to_cudac                 C   s6   t |tjjo4|jdko4|jtjjjj	tjjj
jfv S rB  )r:   r0   rv   r   rK   rL   r   r   Zsym_sizer]   Z	sym_numelr   )rE   rR   r2   r2   r3   is_sym_node  s    z$AutogradCompilerInstance.is_sym_nodec                    s   t   t| jjjddD ]\}} |j  q|tt	d ksHJ  fdd}t| jjj
}| jj| t| jjj
}td||  d S )Nr   rN   rO   c                    s(   |  v s| j dkr | jtv r dS |  S )NrI   T)rK   rL   _impure_targets	is_impurerR   Zunpack_nodesr2   r3   r\    s    z/AutogradCompilerInstance.dce.<locals>.is_impurezDCE removed %d nodes)r+   ro   r   rH   rX   updaterR  rS  r   r   rW   Zeliminate_dead_coderW  rX  )rE   rq   rR   r\  beforeafterr2   r^  r3   dce  s    zAutogradCompilerInstance.dcec           
      C   s   g }g }t | jjj}t| t|}|jdks4J |j D ]0}|jt	j
ksRJ |jrd|| q>|| q>t }|D ]b}t|jtsJ |jd |ksJ t|jd tsJ t|}||jd  |jd |f|_qz|D ]}	| jj|	 q|S )Nr   r   rO   )rV   r   rH   rW   rU   rP   rR  rS  rL   r[   r\   r^   setr:   rY   rx   r]   r   r   Z
erase_node)
rE   Z
used_sizesZunused_sizesitZ
sizes_nodegetitem_nodeused_sizes_idxusedZnext_size_idxZunusedr2   r2   r3   remove_unused_sizes  s,    z,AutogradCompilerInstance.remove_unused_sizesc                 C   s   t | jj| jj|S r5   )r   r   r   rH   )rE   r   r2   r2   r3   create_graph_module  s    z,AutogradCompilerInstance.create_graph_modulec              	      s  j dtjdi  j  j ddj |fi  g t	 rX
j jj jjD ] }dD ]}||jv rj|j|= qjqbtddd fddd	                 jrjj j  d
j  t dg td dddd}td| t !d| td fddd fdd}t" j#dt$% djij&dd j'(d d d  |) fS )NrI   r2   rM   )Ztensor_metaZexample_valuer   r   c                   S   s
   dddS )NZ&compiled_autograd_graph_pre_reorderingstringr   r2   r2   r2   r2   r3   r     s    z6AutogradCompilerInstance.end_capture.<locals>.<lambda>c                      s&   t  jj jjd j djddS )NCompiledAutogradZPreReorderingFZprint_output)r   r   r   rH   r   print_readabler2   r|   r2   r3   r     s   r   rk  rb   zCompiled autograd graphT)Zinclude_deviceZinclude_strideZcoloredz%sZcompiled_autograd_graphc                      s    j ddS )NFrl  )rm  r2   rG   r2   r3   r     rT   )r   c              	      s2  z$da jrj| g }t|D ]H\}}|v r&|dkrd|td| tj|d d q&|| q& D ]}	||	 	 j
dd||	< qtt v tjJ | |||||}
jrȈj|
 |
W  d    W  d    W da S 1  s0    Y  W d    n1 s0    Y  W da nda 0 d S )NTr   rK  rO   )Znon_blockingF)in_compiled_autograd_regionr   rh   ro   r^   r0   r  r   Zmaybe_mark_dynamicZ
pin_memoryrL  _disabler   r   rr   )Zcompiled_fnrb   r   r   r   Zpacked_inputsZfiltered_sizesrg   integerrq   ri   )runtime_inputs_to_moverE   rf  r2   r3   runtime_wrapper  s.    
 6z=AutogradCompilerInstance.end_capture.<locals>.runtime_wrapperr.   r   r   )*r   r   r
   _exec_final_callbacks_stubr   r   r   Z
create_argr   r7   rY  rH   rW   rT  r   delay_unpack_hook_nodesreorder_tensor_pre_hook_nodes'reorder_pre_hook_nodes_to_schedule_asapreorder_accumulate_grad_nodes%reorder_pre_hook_nodes_to_mimic_eager reorder_post_acc_grad_hook_nodesreorder_post_hook_nodesrb  r   ra   rh  ri  r   r   r   compiled_autograd_loginforW  rX  r   Zlog_event_endr   r   r   r   __exit__r   )rE   r  rR   fieldZlazy_graph_coderr  r2   )rH   rq  rE   rf  r3   end_capture  sz    



z$AutogradCompilerInstance.end_capturec                 C   s   dd | D }|S )Nc                 S   s    g | ]}t |tjju r|qS r2   )r   r0   rv   r   r   r2   r2   r3   rS   E  rT   z:AutogradCompilerInstance.get_all_nodes.<locals>.<listcomp>r2   )rY   rW   r2   r2   r3   get_all_nodesB  s    z&AutogradCompilerInstance.get_all_nodesc                 C   s8   | j dks0| j dkr4| jtjkr4| jd j dkr4dS dS )Nr   rI   r   TF)rK   rL   r[   r\   rY   r]  r2   r2   r3   is_placeholderH  s    

z'AutogradCompilerInstance.is_placeholderc                 C   s   | j jjdtdD ]t}|jd |jd  }}d}|jtjkrJ|}|jd }t||g}||j	ur| 
|s|| |dur|| qdS )a  
        Usage of AOTAutograd causes all the accumulate_grad_ nodes to get pushed to the end of
        the graph.  This differs from eager mode, which schedules them as soon as possible. This
        pass attempts to reorder the graph to mimic eager behavior.
        rI   rJ   r   rO   N)r   rH   rX   r   rY   rL   r[   r\   maxprevr  r^   )rE   rR   r`   Z	grad_nodere  argr2   r2   r3   rw  R  s    


z6AutogradCompilerInstance.reorder_accumulate_grad_nodesc                 C   sD   | j jjdtdD ],}|jdddkr*qt|j}|| qdS )zp
        We can delay unpack hooks until they are needed, even later than in the eager autograd engine.
        rI   rJ   r=  Nr;  )	r   rH   rX   r	   r~   r   minrR  prepend)rE   rR   Z
first_userr2   r2   r3   rt  g  s    

z0AutogradCompilerInstance.delay_unpack_hook_nodesc                 C   sl   | j jjdtdD ]T}|jdddkr*q|jd }|jd }||jur| |s|	| |	| qdS )a  
        Usage of AOTAutograd causes all the tensor_pre_hook nodes to get pushed
        to the end of the graph. This differs from eager mode, which schedules
        them as soon as possible. This pass attempts to reorder the graph to
        mimic eager behavior.
        rI   rJ   r=  NrA  r   rO   )
r   rH   rX   r	   r~   r   rY   r  r  r^   )rE   rR   re  
input_noder2   r2   r3   ru  t  s    



z6AutogradCompilerInstance.reorder_tensor_pre_hook_nodesc                 C   s   | j jjdtdD ]}|jdddkr*q|jd }| |jd }g }g }|g}|D ]>}|jdkrV|j	t
jkrV||jd  || || qVt||D ]\}}	|| ||	 qt|}
|
|jur| |
s|
| |D ]}|| qqdS )a  
        In this function, we schedule the pre hooks as soon as possible. This
        does not match eager behavior (schedule pre hook right before its
        registered node), but it can make acc grad be scheduled properly when
        the pre hooks are registered to them. After reordering acc grad node, we
        will reorder the pre hooks again to mimic eager behavior.
        rI   rJ   r=  NrE  r   rO   )r   rH   rX   r	   r~   r   rY   r  rK   rL   r[   r\   r^   zipremover  r  r  )rE   rR   re  input_nodesZ	to_removeZ	to_appendZ
hook_blockr   abr  r2   r2   r3   rv    s.    




z@AutogradCompilerInstance.reorder_pre_hook_nodes_to_schedule_asapc                 C   s   g }| j jjdtdD ]"}|jdddkr.q|| qt|D ]}|jd }t	|j
 }t|dkrlqBtdd |D sJ tt|d j
 }||jurB|| || |D ]}|| qqBdS )	a%  
        Usage of AOTAutograd causes all the pre_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them
        right before their registered node execution. This pass attempts to
        reorder the graph to mimic eager behavior.
        rI   rJ   r=  NrE  r   c                 s   s$   | ]}|j d ko|jtjkV  qdS )rI   N)rK   rL   r[   r\   rO  r2   r2   r3   r     s   zQAutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eager.<locals>.<genexpr>)r   rH   rX   r	   r~   r   r^   reversedrY   rQ  rR  rS  r   rU  rU   rV   r  )rE   Z	pre_hooksrR   Zhook_getitem_noderR  Zregistered_noder\   r2   r2   r3   rx    s*    




z>AutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eagerc                 C   s   g }| j jjdtdD ]"}|jdddkr.q|| qt|D ]p}|jd }|jd }d}t	|j
 D ] }|jdkrl|jtkrl|} qql|dusJ d|| || qBdS )	a  
        Usage of AOTAutograd causes all the post_acc_grad_hook nodes to get
        pushed to the end of the graph. This differs from eager mode, which
        schedules them as soon as possible. This pass attempts to reorder the
        graph to mimic eager behavior.
        rI   rJ   r=  NrI  r   rO   z8post_acc_grad_hook must have corresponding acc grad node)r   rH   rX   r	   r~   r   r^   r  rY   rQ  rR  rS  rK   rL   r   )rE   Zpost_acc_grad_hooksrR   re  r`   Zacc_grad_noder   r2   r2   r3   ry    s(    




z9AutogradCompilerInstance.reorder_post_acc_grad_hook_nodesc                    sl  g }| j jjdtdD ]"  jdddkr.q|  qt|D ]"  jd } jd } jd }t	|dkrtqBg }|
t| |D ]&}|
 fd	d
t|j D  qt|}|jdkr>|jtkr>|jd }d}	t|j D ].}
|
jdkr|
jtkr|
jdddkr|
}	q|	dur>|	| |  qB| jurB| |sB|| |  qBdS )a  
        Usage of AOTAutograd causes all the post_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them as
        soon as possible. This pass attempts to reorder the graph to mimic eager
        behavior.
        rI   rJ   r=  NrH  r   rO      c                 3   s8   | ]0}|j d kr,|jtkr, jdddks|V  qdS )rI   r=  NrH  )rK   rL   r	   r~   r   rO  r]  r2   r3   r     s
   

zCAutogradCompilerInstance.reorder_post_hook_nodes.<locals>.<genexpr>rI  )r   rH   rX   r	   r~   r   r^   r  rY   r   extendrQ  rR  rS  r  rK   rL   r   r  r  )rE   Z
post_hooksre  r_   r  Zinput_nodes_and_usersr  r  r`   Zpost_acc_grad_hook_noder   r2   r]  r3   rz    sL    










z0AutogradCompilerInstance.reorder_post_hook_nodesc                    s   |d u rd S t |tr( fdd|D S t |trHt fdd|D S t |tjtjfrf j|j S t |tjsv|S t	 j
|}t |tjjjjsJ |jS )Nc                    s   g | ]}  |qS r2   r   r8  r|   r2   r3   rS   0  rT   z5AutogradCompilerInstance.to_proxy.<locals>.<listcomp>c                 3   s   | ]}  |V  qd S r5   r   r8  r|   r2   r3   r   2  rT   z4AutogradCompilerInstance.to_proxy.<locals>.<genexpr>)r:   rQ  rx   r0   ZSymIntZSymFloatr   rR   ry   r#   r   rv   r   proxy_tensorZ_ProxyTensorr@  )rE   tr  r2   r|   r3   r   ,  s    

z!AutogradCompilerInstance.to_proxy)r   c                    s   t  tjjr|rjt|t|ks&J g }tt|D ],}|| \}}| ||d  | |  q6| n fddtt|D  t|t ksJ t| d | j	d  S )Nc                    s   g | ]} | qS r2   r2   r   r   r2   r3   rS   I  rT   zDAutogradCompilerInstance.bind_objects_to_proxies.<locals>.<listcomp>ZconstantZtracer)
r:   r0   rv   r-   r   r   set_node_originr^   r&   r   )rE   Zobjectsr   r   Zbound_proxiesrq   nodecall_indexr   r2   r  r3   r   <  s    z0AutogradCompilerInstance.bind_objects_to_proxies)indexc                 C   s4   | j d usJ | j | }t }t||d | jd |S )Nr  )r   r   r&   r   )rE   r  r@  Zbw_stater2   r2   r3   bind_backward_stateO  s
    
z,AutogradCompilerInstance.bind_backward_state)r   r  pyobjc           	      C   sp   d}|d ur4|j }t|dr4|jd u r.td|j}| | d| d}t  d }|d|}t	| d S )N r  zThis compiled backward function was saved by AOTAutogradCache, which does not support
                    compiled autograd. Please turn off AOTAutogradCache using `TORCHINDUCTOR_AUTOGRAD_CACHE=0`.z (NodeCall )rK  z:raw_stack_trace = CapturedTraceback.extract().format()[-1])
r  r   r;   rm   r  r,   extractformatreplacer*   )	rE   r   r  r  Zmaybe_aot_idZforward_clsZnew_codeZraw_stack_traceZnew_stack_tracer2   r2   r3   r  V  s    

z(AutogradCompilerInstance.set_node_origin)N)=rs   rt   ru   rF   r   staticmethodr   r   rQ  r0   ry   r]   r   r   rx   r   rZ   r   r   r  autogradfunctionZBackwardCFunctionr   r  r)  r.  r  r0  r1  r!  r4  r6  rA   r:  r;  rA  rD  rE  rH  rI  rY  rZ  rb  rh  ri  r  r  r  rw  rt  ru  rv  rx  ry  rz  r   r   r  ZFunctionr  r2   r2   r2   r3   r     s|   w H3
1"w

	$#$9 	
r   FT)dynamicc              
   c   sN  |st rd V  n8|r&t|tu s&J ddlm} |jjdkrZdazd V  W danda0 ndd l}|j	j
jtt| |\}}t r|j	j
jt dat}td7 azj|jd d V  W d    n1 s0    Y  W |sda|j	j
j|| td8 at|ksJJ dn8|sda|j	j
j|| td8 at|ksHJ d0 d S )Nr   )
eval_frameZforce_eagerTFrO   zINested Compiled Autograd Contexts must return before their parent context)active_disable_ctxr   rZ   Ztorch._dynamor  Z_stanceZstance%compiled_autograd_enabled_force_eagerZtorch._inductor.cudagraph_treesrC  r   r.   set_autograd_compiler	functoolspartialr   r4   set_verbose_loggerrW  compiled_autograd_enableddepthr  Zset_multithreading_enabled)r   r  Zignore_active_disable_ctxr  r0   prior_compilerprior_dynamicZprior_depthr2   r2   r3   _enable}  sV    

&

r  c               
   c   sp   t jjjd d\} }dats"daz(d V  W | r4dadat jjj| | n | rTdadat jjj| | 0 d S )NFT)r0   rC  r   r.   r  r  r  )r  r  r2   r2   r3   ro    s*    

ro  r   c                   C   sH   da trJ tjjjd d tjjjd  tjjj  t	
 ad S )NF)r  rn  r0   rC  r   r.   r  r  clear_cache	itertoolsr   r   r2   r2   r2   r3   reset  s    r  c                 C   sT   | d }| ||}|d us J || || }	||||	}
||
|
jtjdgS )Nr   )Zmemory_format)Znew_empty_stridedcopy_Z
as_stridedcloner0   Zcontiguous_format)rb   r#  r$  r%  r&  r'  r(  rd   r   offsetr-  r2   r2   r3   r"    s    	
r"  c                 C   sf   d gt |  }tt | D ]F}| | r|| d u r4q|dkrT|||  |||< q|| ||< q|S )Nr   )r   r   r  )r+  r   r,  r-  Zgrad_inputsrq   r2   r2   r3   r*    s    
r*  )TT)e__doc__r   r  r  r[   r   collectionsr   r   typingr   r   r   r0   Ztorch.utils._pytreeutilsZ_pytreer  Ztorch._dynamo.external_utilsr   r   r	   r
   r   Ztorch._dynamo.sourcer   r   Ztorch._dynamo.utilsr   r   r   r   Z/torch._functorch._aot_autograd.runtime_wrappersr   r   Ztorch._guardsr   r   r   Ztorch._loggingr   r   Ztorch._prims_commonr   Ztorch._subclassesr   Ztorch.fxr   Z%torch.fx.experimental._backward_stater   Z"torch.fx.experimental.proxy_tensorr    r!   r"   r#   r$   r%   r&   Z%torch.fx.experimental.symbolic_shapesr'   r(   Ztorch.fx.tracebackr)   r*   Ztorch.utils._ordered_setr+   Ztorch.utils._tracebackr,   Ztorch.fx.proxyr-   r   rs   r{  rW  r4   r7   r9   r?   r@   rz   r   r   r   rs  r[  r   r   r   r   r  r  rn  r  r  contextmanagerrZ   r  ro  r  r"  r*  r2   r2   r2   r3   <module>   s   $	

O	        eG
