o
    Zh                     @   s@  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
mZmZmZ ddlZddlm  mZ ddlmZmZmZ ddlmZmZ ddlmZmZmZmZ ddlm Z m!Z!m"Z" dd	l#m$Z$m%Z% dd
l&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? erddl@mAZA e$eBdZCe$eBdZDdd ZEdd ZFdd ZGG dd dZHG dd dZIeH ZJg d ZKe=eeejLejJjMjNjOgZPeQ aRd!d" ZSG d#d$ d$ZTd%aUd%aVd%aWejXd1d&d'ZYejXd(d) ZZd2d+d,Z[d-d. Z\d/d0 Z]dS )3a  
Provides functionality for compiling PyTorch's autograd (automatic differentiation) system.

This module implements compiled autograd, which traces and optimizes backward pass
computations at runtime. The key components are:

- AutogradCompilerInstance: Traces and compiles autograd graphs using FX
- Context managers (_enable/_disable): Control when compiled autograd is active
- Utility functions: Support graph manipulation, tensor operations, and hooks

Compiled autograd can significantly improve backward pass performance by removing
Python overhead and enabling additional optimizations. It works by capturing
backward computations into an FX graph that can be compiled and optimized,
while maintaining the same semantics as eager mode autograd.
    N)Counterdefaultdict)AnyOptionalTYPE_CHECKINGUnion)call_backward	call_hookFakeCompiledAutogradEngineGetItemSourceLocalSource)countersget_chromium_event_loggerlazy_format_graph_codeset_locals_to_stealcompile_contextCompileContext	CompileId)getArtifactLoggertrace_structuredclone_preserve_strides)FakeTensorMode)GraphModule)BackwardState)	decomposedisable_autocast_cachedisable_proxy_modes_tracingfetch_object_proxyProxyTorchDispatchModePythonKeyTracertrack_tensor_tree)
DimDynamicShapeEnv)preserve_node_metaset_stack_trace)
OrderedSet)CapturedTraceback)Proxycompiled_autogradcompiled_autograd_verbosec                   C   s   t jjjdS )Nr,   )torchZ_logging	_internalZ	log_stateZis_artifact_enabled r/   r/   N/var/www/auris/lib/python3.10/site-packages/torch/_dynamo/compiled_autograd.py snapshot_verbose_logging_enabledF   s   
r1   c                   C   s   t jjjjS N)r-   Z	_inductorconfigZtritonZ
cudagraphsr/   r/   r/   r0   snapshot_cudagraph_enabledL      r4   c                 C   s   | d urt | S | S r2   r   )xr/   r/   r0   maybe_cloneP   s   r7   c                   @   $   e Zd Zdd Zdd Zdd ZdS )OpNamespacec                 C   s   t  | _d S r2   )r   custom_function_name_counterselfr/   r/   r0   __init__\   r5   zOpNamespace.__init__c                    s   |rd| }| j | }| j |  d7  < | | }t| |r!J t||| |r5t| |tj  |S tjj fdd}t| || |S )NZCppNode   c                     s    | i |S r2   r/   argskwargsresultr/   r0   run_non_traceable_cpp_in_eagerm      z7OpNamespace.add.<locals>.run_non_traceable_cpp_in_eager)r:   hasattrOpsetattrr-   _dynamoallow_in_graphdisable)r<   namefnis_custom_functionis_traceablecountrD   r/   rB   r0   add_   s   
	zOpNamespace.addc                 C   s
   t | |S r2   )getattr)r<   rL   r/   r/   r0   gett   s   
zOpNamespace.getN)__name__
__module____qualname__r=   rQ   rS   r/   r/   r/   r0   r9   [   s    r9   c                   @   r8   )rG   c                 C   s   || _ || _|| _d| _d S )Nz#torch._dynamo.compiled_autograd.ops)rM   rN   rT   rU   )r<   rL   rM   rN   r/   r/   r0   r=   y   s   
zOp.__init__c                 O   s   | j |i |S r2   )rM   )r<   r@   rA   r/   r/   r0   __call__      zOp.__call__c                 C   s   | j d | j S )N.)rU   rT   r;   r/   r/   r0   __repr__   rX   zOp.__repr__N)rT   rU   rV   r=   rW   rZ   r/   r/   r/   r0   rG   x   s    rG   )inputssizesscalarshooksZpacked_datac                 C   s   t tt| d d dS )N)compiled_autograd_idZframe_idZframe_compile_idr   )r_   r/   r/   r0   make_compile_context   s   r`   c                   @   s   e Zd Zd`ddZdd ZedefddZd	ee	j
 d
ee deeeef  deeeeef   fddZdee fddZdd Zdede	jjjdee fddZdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Z d+d, Z!d-efd.d/Z"d0d1 Z#d2d3 Z$d4d5 Z%dee fd6d7Z&d8d9 Z'd:d; Z(d<d= Z)d>d? Z*d@dA Z+edBdC Z,edDdE Z-dFdG Z.dHdI Z/dJdK Z0dLdM Z1dNdO Z2dPdQ Z3dRdS Z4dTdU Z5	dadeeeeef   fdVdWZ6dXefdYdZZ7d[ed\ed]ee	jj8 fd^d_Z9dS )bAutogradCompilerInstancereturnNc                 C   sT   || _ t | _| jj| _t | _tdd| jd| _t	 | _
t| j
d| _d | _d S )NT)Zallow_fallback_kernelsZallow_non_fake_inputs	shape_envZsymbolic)compiler_fn
contextlib	ExitStackstackcloser%   rc   r   fake_tensor_moder"   	fx_tracerr!   
proxy_modehooks_proxy)r<   rd   r/   r/   r0   r=      s   


z!AutogradCompilerInstance.__init__c                 C   s    t |tjsJ | jj||dS )N)source)
isinstancer-   Tensorri   Zfrom_tensor)r<   r6   rm   r/   r/   r0   	wrap_fake   s   z"AutogradCompilerInstance.wrap_fakec                 C   s   t t| |S r2   r   )rL   idxr/   r/   r0   rm      rE   zAutogradCompilerInstance.sourcer[   r\   r]   originsc                    sn  t d d  d7  < tt _t j _ j  t  _	t
 jd j	d jidd d  _i  _tj  j_tjjtd j_i  j_i  _ fdd	tD \} _ _ _ _ jt   |\}}} fd
dt!|D } "|||  fddt!|D } "| j|}	t!|D ]\}
}|	|
  j|j#< qt!|D ]:\}} $d|}t%|t&r j'(||t)j*||< qt%|t+rֈ j'j, j'j-||t)j*d||d||< qt.dt/| "| j| t!|D ]\}
} j|
  j|j#< q jt0i   j j1  j j2  jt3   j1j'd usJ  j1j'} jtjj4j56| t7t89 |||fS )Nr+   Zcapturesr>   graph_idTZlog_pt2_compile_event)Z
tracer_clsc                 3   s"    | ]} j d |di V  qdS )placeholderr/   N)rj   create_proxy).0rL   r;   r/   r0   	<genexpr>   s
    
z9AutogradCompilerInstance.begin_capture.<locals>.<genexpr>c              	      s$   g | ]\}}  | d |qS )r[   )rp   rm   )rw   rq   r6   r;   r/   r0   
<listcomp>   s    z:AutogradCompilerInstance.begin_capture.<locals>.<listcomp>c              	      s*   g | ]\}} j | d |tjqS )r\   )rc   $create_unspecified_symint_and_symbolrm   r$   DYNAMIC)rw   rq   valr;   r/   r0   ry      s    
r]   )rm   Zdynamic_dim)hintrm   zUnexpected scalar type: ):r   nextCOMPILE_COUNTERidr`   r   	__enter__timetime_nsstart_time_nsr   Zlog_event_startaot_graph_cls_nameaot_graph_infosr-   nnModulerj   rootfxZGraphr"   graphZtensor_attrssymnode_proxy_lookup_graph_placeholdersZsizes_proxyZscalars_proxyrl   packed_data_proxyrg   enter_contextr&   	enumeratebind_objects_to_proxiesnoderm   rn   intrc   rz   r$   r{   floatZcreate_symfloatnodeZcreate_unspecified_symbolAssertionErrortyper   ri   rk   r   experimentalZsymbolic_shapesZ_suppress_guardsstrr   Zcurrent_compile_id)r<   r[   r\   r]   rr   Z
args_proxyZinputs_originsZsizes_originsZscalars_originsproxiesiZsymintrq   r|   rm   Zsymvalenvr/   r;   r0   begin_capture   s   











z&AutogradCompilerInstance.begin_capturecompile_reasonsc                    s&    sJ t ddd  fddd d S )Nartifactc                   S   
   dddS )NZ!compiled_autograd_compile_reasonsjsonrL   encodingr/   r/   r/   r/   r0   <lambda>#     z>AutogradCompilerInstance.log_compile_reasons.<locals>.<lambda>c                      s    S r2   r/   r/   r   r/   r0   r   '  s    Zmetadata_fn
payload_fn)r   )r<   r   r/   r   r0   log_compile_reasons  s   

z,AutogradCompilerInstance.log_compile_reasonsc                    s   fdd   D } j}|j|j~tjjfdd}	jjd|	||g|R i dd |d ur:j	|  fdd}
|
 }fd	d
}tj
jjj||d}tj|}|S )Nc                       g | ]}  |qS r/   to_proxyrw   er;   r/   r0   ry   >      zDAutogradCompilerInstance.proxy_call_aot_backward.<locals>.<listcomp>c                    s"   t jjjj| | g|R  }|S r2   )r-   
_functorch_aot_autogradruntime_wrappersZ_backward_prologue_functional)Zctx_saved_tensorsZctx_symints	flat_argsout)maybe_subclass_metadatametadatar/   r0   call_aot_bwd_prologueF  s   
zOAutogradCompilerInstance.proxy_call_aot_backward.<locals>.call_aot_bwd_prologuecall_functionkindtargetr@   rA   c                     s  dd } | j j}fddt|td u D } }t|tjks*J fdd|D }||d t|< d urD| d}i d }j jjD ]p}|j	dkrb|| j
|< |d7 }qO|j	d	kr~t|jdkspJ fd
d|jd D }qO|j	dkr|j}j|}	tjj|	tj | jd|	di }
|
|< qO|j	dkrjj|fdd}
|
|< qOtd|d usJ dd   fdd|D }|| |S )Nc                 S   s,   d}| j D ]}|jdkr|d7 }q |S |S )Nr   ru   r>   )nodesop)r   num_argsr   r/   r/   r0   
num_inputsb  s   

zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.num_inputsc                       g | ]} | qS r/   r/   rw   r   )pgradsr/   r0   ry   o  s    zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<listcomp>c                    r   r/   r   r   r;   r/   r0   ry   u  r   r   ru   r>   outputc                    s2   g | ]}t |tjjrtj|  jn|qS r/   )rn   r-   r   Noder*   rj   rw   n)r<   value_remapr/   r0   ry     s    Zget_attrr/   r   c                    s    |  S r2   r/   )r   )r   r/   r0   r     s    ziAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.<lambda>zshouldn't get herec                   S   s<   t   tdddddW  d    S 1 sw   Y  d S )Nr   {   r   r-   Zzerosr/   r/   r/   r0   dummy  s   $zfAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graph.<locals>.dummyc                    s$   g | ]}t |tjjr  n|qS r/   )rn   r-   r   r*   )rw   o)r   r/   r0   ry     s    )Z
_bw_moduler   ranger   _get_compiled_autograd_symintslensymintsappendr   r   r   r@   r   rj   Zget_fresh_qualnamerH   r   rR   create_nodeZ	node_copyr   r   )r   r   Z	pall_argsr   psymintsZargs_idxZpoutputsr   rL   qualnamerC   outputs)ctxpbackward_stater   r<   )r   r   r0   copy_paste_aot_backward_grapha  sX   









zWAutogradCompilerInstance.proxy_call_aot_backward.<locals>.copy_paste_aot_backward_graphc                    sX   t jj fdd}tj|}jjd|t|i d}	 }
|g|g |S )Nc                     s   j |  dS )N)
is_runtime)Zcreation_fn)unwrapped_argsr   subclass_metar/   r0   make_subclass  rE   zkAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor.<locals>.make_subclassr   r   )r-   rI   rJ   pytreetree_mapr   rj   rv   tupleallocate_dummyr   )r   r   r   r   Zpunwrapped_argsZpoutputr   r;   r   r0   proxy_subclass_constructor  s   zTAutogradCompilerInstance.proxy_call_aot_backward.<locals>.proxy_subclass_constructor)Zmake_subclass_override)r   _forward_clsr   r   r-   rI   rJ   rj   rv   rl   r   r   r   Z_backward_epilogue_functionalr   r   r   )r<   pinputspsaved_tensorssaved_tensorspctxr   maybe_backward_state_idxr   ZCompiledFunctionr   r   r   r   resultsZpresultsr/   )r   r   r   r   r   r<   r0   proxy_call_aot_backward*  s>   

H
z0AutogradCompilerInstance.proxy_call_aot_backwardbackward_idxr   r   c              
   C   s  | j d usJ | j | }| |}| |}	t|jdr'| ||	||||}
n| jjdt||	g|R i d}
|
d us<J t > g }t	|D ]&\}}|d u sT|
| d u rZ|
d  qF|\}}}}|
tj||||d qF| ||
 W d    t|S 1 sw   Y  t|S )N_aot_idr   r   )sizedtypelayoutdevice)rl   r   rF   r   r   rj   rv   r   r   r   r   r-   emptyr   r   )r<   r[   Zoutput_metadatasr   r   r   r   r   r   r   r   Zgrad_insrq   output_metadatar   r   r   r   r/   r/   r0   proxy_call_backward  sN   	


	


z,AutogradCompilerInstance.proxy_call_backwardc                 C   s>   ||  | | |  | | f}| t|d gd S )N   )r\   stridesZstorage_offset
proxy_callcopy_slices_prologue)r<   r[   baseviewr@   r/   r/   r0   call_copy_slices_prologue  s   	z2AutogradCompilerInstance.call_copy_slices_prologuec                 C   s    |  t||||fd gt| S r2   )r   copy_slices_epiloguer   )r<   needs_input_gradrC   res
grad_slicer/   r/   r0   call_copy_slices_epilogue  s
   
z2AutogradCompilerInstance.call_copy_slices_epiloguec                 C   s8   t   tddgW  d    S 1 sw   Y  d S )Nr   i[r   r;   r/   r/   r0   r     s   $z'AutogradCompilerInstance.allocate_dummyc                 C   s   t ||||S )zBinds ops.fn_name = fn)opsrQ   )r<   fn_namerM   rN   rO   r/   r/   r0   bind_function  s   z&AutogradCompilerInstance.bind_functionc                 C   s    t |}| ||g|R |S )z:Proxies a call to ops.fn_name(grads, *args) into the graph)r   rS   r   )r<   r   Zgradsr@   r   r   r/   r/   r0   apply_functional  s   
z)AutogradCompilerInstance.apply_functionalc                    sn   t |\}}t fdd|}jjd||i d fdd|D }| fddtt|D  |S )z*Proxies a call to fn(*args) into the graphc                    s
     | S r2   r   )r   r;   r/   r0   r     s   
 z5AutogradCompilerInstance.proxy_call.<locals>.<lambda>r   r?   c                    s   g | ]}   qS r/   )r   )rw   _r;   r/   r0   ry   !      z7AutogradCompilerInstance.proxy_call.<locals>.<listcomp>c                    r   r/   r/   r   )	proxy_outr/   r0   ry   "  r  )r   Ztree_flattenr   rj   rv   r   r   r   )r<   rM   r@   r   r   r  
proxy_argsrC   r/   )r  r<   r0   r     s   "z#AutogradCompilerInstance.proxy_callc                 C   sX   t d}t| j|g|R }| jjd||i d}t|t|ks$J | || |S )zEProxies a call to ops.validate_outputs(outputs, *args) into the graphvalidate_outputsr   r?   )	r   rS   r   r   r   rj   rv   r   r   )r<   r  r   r@   r   r   r  Znew_proxy_outputsr/   r/   r0   r  %  s   
z)AutogradCompilerInstance.validate_outputsc                 C   sJ   |  |}|  |}| jjdtj||fi d}|  }| |g|g |S )Nr   r?   )r   rj   rv   r-   rQ   r   r   )r<   Zold_varZnew_varZold_var_proxyZnew_var_proxyr  rC   r/   r/   r0   
accumulate0  s   

z#AutogradCompilerInstance.accumulatec                    s(    j dt|g fdd|D R |S )Nr   c                    r   r/   r   rw   r6   r;   r/   r0   ry   @  r   z<AutogradCompilerInstance.proxy_call_hook.<locals>.<listcomp>)rj   rv   r	   )r<   hookr@   rA   r/   r;   r0   proxy_call_hook:  s   z(AutogradCompilerInstance.proxy_call_hookc                 C   sN   | j d usJ | j | }| j| }| j||dd}|  }| |g|g |S )Nunpack_hook	hook_type)rl   r   r
  r   r   )r<   hook_idZdata_idr	  dataproxyr   r/   r/   r0   r  E  s   

z$AutogradCompilerInstance.unpack_hookr   c                 C   s|   | j d usJ | j | }| j||| dd}t  t|| ||< | || g|g W d    |S 1 s7w   Y  |S )Ntensor_pre_hookr  )rl   r
  r   r7   r   )r<   r[   r  r   r	  r  r/   r/   r0   r  R  s   

z(AutogradCompilerInstance.tensor_pre_hookc                 C   sn   | j d usJ | j | }| j||dd}t  dd |D }| || W d    |S 1 s0w   Y  |S )Npre_hookr  c                 S      g | ]}t |qS r/   r7   r  r/   r/   r0   ry   h  r  z5AutogradCompilerInstance.pre_hook.<locals>.<listcomp>rl   r
  r   r   )r<   r[   r  r	  r   r/   r/   r0   r  _  s   

z!AutogradCompilerInstance.pre_hookc                 C   sp   | j d usJ | j | }| j|||dd}t  dd |D }| || W d    |S 1 s1w   Y  |S )N	post_hookr  c                 S   r  r/   r  r  r/   r/   r0   ry   v  r  z6AutogradCompilerInstance.post_hook.<locals>.<listcomp>r  )r<   r   r[   r  r	  r   r/   r/   r0   r  l  s   

z"AutogradCompilerInstance.post_hookc                 C   s|   t |tjsJ | jd usJ | j| }| j||dd}t  t|g}| ||g W d    |S 1 s7w   Y  |S )Npost_acc_grad_hookr  )rn   r-   ro   rl   r
  r   r7   r   )r<   inputr  r	  r  r/   r/   r0   r  z  s   


z+AutogradCompilerInstance.post_acc_grad_hookc                 C   sB  i }d}t |j}|d jdksJ |d }t |j }tt}|| |d ks+J |t| d }|| |d ks=J t|D ]>\}	}
|sS|
jd j	j
dkrSd}qA|
jd j	j
d	k}t|
jd  dk}|r|rt |
j }td
d |D r|
||	< qA|r| D ]}
td|
 |
jd  |
jd< qt | S g S )NFr   r[   r>   r|   cudaTcpuc                 s   sB    | ]}t |jtjjo|jjd v pt |jto|jj V  qdS ))ZprimsatenN)rn   r   r-   Z_opsZ
OpOverload	namespacerG   rN   rw   userr/   r/   r0   rx     s    	

zDAutogradCompilerInstance.move_graph_nodes_to_cuda.<locals>.<genexpr>zMoving node %s from cpu to cuda)listr   r   userskeysr   r   r   metar   r   r   allvaluesverbose_logdebugr  )r<   r   Zto_moveZhas_cuda_inputsr   r[   Zinputs_usersZfirst_getitem_idxZlast_getitem_idxr   r   Zis_cpuZ	is_scalarZ
node_usersr/   r/   r0   move_graph_nodes_to_cuda  s:   
	z1AutogradCompilerInstance.move_graph_nodes_to_cudac                 C   s6   t |tjjo|jdko|jtjjjj	tjjj
jfv S )Nr   )rn   r-   r   r   r   r   r   r  Zsym_sizer   Z	sym_numeldefault)r<   r   r/   r/   r0   is_sym_node  s   z$AutogradCompilerInstance.is_sym_nodec                    s   t   t| jjjddD ]\}} |j  q|tt	d ks$J  fdd}t| jjj
}| jj| t| jjj
}td||  d S )Nru   )r   r>   c                    s0   |  v p| j dkp| j dkp| j dko| jtv S )Nru   r   r   )r   r   _impure_targetsr   Zunpack_nodesr/   r0   	is_impure  s   z/AutogradCompilerInstance.dce.<locals>.is_impurezDCE removed %d nodes)r(   r   rj   r   
find_nodesupdater!  r"  r   r   r   Zeliminate_dead_coder&  r'  )r<   r   r   r.  beforeafterr/   r-  r0   dce  s   zAutogradCompilerInstance.dcec                 C   s   t | jj| jj|S r2   )r   rj   r   r   )r<   r   r/   r/   r0   create_graph_module  s   z,AutogradCompilerInstance.create_graph_modulec              	      s  j dtjdi  j  j ddj |fi  g t	 r,
j jj jjD ]}dD ]}||jv r@|j|= q5q1tddd fddd	                   d
j  t dg td dddd}td| td| td fddd fdd}t  j!dt"# djij$dd j%&d d d  |' fS )Nr   r/   r   )Ztensor_metaZexample_valuer|   r   c                   S   r   )NZ&compiled_autograd_graph_pre_reorderingstringr   r/   r/   r/   r/   r0   r     r   z6AutogradCompilerInstance.end_capture.<locals>.<lambda>c                      s&   t  jj jjd j djddS )NCompiledAutogradZPreReorderingFZprint_output)r   rj   r   r   r   print_readabler/   r;   r/   r0   r      s    r   r6  r[   zCompiled autograd graphT)Zinclude_deviceZinclude_strideZcoloredz%sZcompiled_autograd_graphc                      s    j ddS )NFr7  )r8  r/   )r   r/   r0   r   '  s    )r   c              	      s   zWda  D ]}||  jdd||< qt / tj | |||||W  d    W  d    W da S 1 s:w   Y  W d    n1 sIw   Y  W da d S W da d S da w )NT)Znon_blockingF)in_compiled_autograd_regionZ
pin_memoryr  _disabler`   r   )Zcompiled_fnr[   r\   r]   r^   Zpacked_inputsr   )runtime_inputs_to_mover<   r/   r0   runtime_wrapper*  s   0z=AutogradCompilerInstance.end_capture.<locals>.runtime_wrapperr+   rs   rt   )(rj   rv   r
   _exec_final_callbacks_stubrg   rh   r   Z
create_argr   r4   r(  r   r   r#  r   rename_aot_dispatcher_nodesdelay_unpack_hook_nodesreorder_tensor_pre_hook_nodes'reorder_pre_hook_nodes_to_schedule_asapreorder_accumulate_grad_nodes%reorder_pre_hook_nodes_to_mimic_eager reorder_post_acc_grad_hook_nodesreorder_post_hook_nodesr3  r4  r   r   r   compiled_autograd_loginfor&  r'  r   Zlog_event_endr   r   r   r   __exit__rd   )r<   r   r   fieldZlazy_graph_coder<  r/   )r   r;  r<   r0   end_capture  sz   



z$AutogradCompilerInstance.end_capturec                 C   s
  | j du rdS dtjjjdtjjjfdd}tt}| j D ]\}}|d }|d }d}|d	 j	}|| r=d
||  }||  d7  < t
|j}	t|	}
|
dusTJ z|
jdkrct|	}
|
jdksZW n	 tym   Y qw zt
| jj	j}t|D ]}t| qzt|}|jdkr|||
st|}|jdkr|||
r|
jdkr|jdkr|jst|}q|||
std| | d
|
j |_t|
jD ]\}}d| | d
|j |j| _qt|	}
t|}|
jdkr|jdksW q ty   td| j ||| Y qw dS )z
        Renames nodes as they appear in the AOTDispatcher backward graphs, prefixed by AOT id
        e.g. AOTDispatcher backward graph X's `sin_Y` -> `aotX_sin_Y`
        Ncaaotc                 S   s   | j |j k}|st| j dot|j do| j j|j jk}|sFt| j drFt|j drF|j  dkrFt|jddrF| j  |jd  k}|o]| j|jko]| j|jko]t| j	t|j	kS )NrT   rL   zaten::reshapeZoriginal_aten)
r   rF   rT   rL   r#  rS   r   r   r   all_input_nodes)rK  rL  Ztarget_matchr/   r/   r0   
is_similarH  s0   




zHAutogradCompilerInstance.rename_aot_dispatcher_nodes.<locals>.is_similarca_node_start_idxaot_id aot_gmr  r>   r   r   zIFailed to match %s%s (NodeCall %s) nodes with AOT backward graph %s nodes)r   r-   r   r   r   r   r   r   itemsr   iterr   r~   r   StopIterationrj   r   r!  rL   r   rM  r&  r'  )r<   rN  Zaot_id_counternodecall_indexrG  rO  rP  Zaot_id_postfixZ	aot_graphZaot_itZaot_nodeca_itr  Zca_noder   inpr/   r/   r0   r>  @  st   






z4AutogradCompilerInstance.rename_aot_dispatcher_nodesc                 C   s   dd | D }|S )Nc                 S   s    g | ]}t |tjju r|qS r/   )r   r-   r   r   r   r/   r/   r0   ry     s     z:AutogradCompilerInstance.get_all_nodes.<locals>.<listcomp>r/   )r@   r   r/   r/   r0   get_all_nodes  s   z&AutogradCompilerInstance.get_all_nodesc                 C   s8   | j dks| j dkr| jtjkr| jd j dkrdS dS )Nru   r   r   TF)r   r   operatorgetitemr@   r,  r/   r/   r0   is_placeholder  s   

z'AutogradCompilerInstance.is_placeholderc                 C   s   | j jjdtjjjjdD ]:}|jd |jd }}d}|j	t
jkr)|}|jd }t||g}||jurG| |sG|| |durG|| qdS )a  
        Usage of AOTAutograd causes all the accumulate_grad_ nodes to get pushed to the end of
        the graph.  This differs from eager mode, which schedules them as soon as possible. This
        pass attempts to reorder the graph to mimic eager behavior.
        r   r   r   r   r>   N)rj   r   r/  r-   r   inductoraccumulate_grad_r)  r@   r   rZ  r[  maxprevr\  r   )r<   r   
param_nodeZ	grad_nodegetitem_nodeargr/   r/   r0   rB    s   



z6AutogradCompilerInstance.reorder_accumulate_grad_nodesc                 C   sD   | j jjdtdD ]}|jdddkrq	t|j}|| q	dS )zp
        We can delay unpack hooks until they are needed, even later than in the eager autograd engine.
        r   r]  r  Nr  )	rj   r   r/  r	   rA   rS   minr!  prepend)r<   r   Z
first_userr/   r/   r0   r?    s   

z0AutogradCompilerInstance.delay_unpack_hook_nodesc                 C   sl   | j jjdtdD ]*}|jdddkrq	|jd }|jd }||jur3| |s3|	| |	| q	dS )a  
        Usage of AOTAutograd causes all the tensor_pre_hook nodes to get pushed
        to the end of the graph. This differs from eager mode, which schedules
        them as soon as possible. This pass attempts to reorder the graph to
        mimic eager behavior.
        r   r]  r  Nr  r   r>   )
rj   r   r/  r	   rA   rS   r@   ra  r\  r   )r<   r   rc  
input_noder/   r/   r0   r@    s   




z6AutogradCompilerInstance.reorder_tensor_pre_hook_nodesc                 C   s   | j jjdtdD ]s}|jdddkrq	|jd }| |jd }g }g }|g}|D ]}|jdkrJ|j	t
jkrJ||jd  || || q+t||D ]\}}	|| ||	 qPt|}
|
|jur|| |
s||
| |D ]}|| qtq	dS )a  
        In this function, we schedule the pre hooks as soon as possible. This
        does not match eager behavior (schedule pre hook right before its
        registered node), but it can make acc grad be scheduled properly when
        the pre hooks are registered to them. After reordering acc grad node, we
        will reorder the pre hooks again to mimic eager behavior.
        r   r]  r  Nr  r   r>   )rj   r   r/  r	   rA   rS   r@   rY  r   r   rZ  r[  r   zipremover`  ra  r\  )r<   r   rc  input_nodes	to_removeZ	to_appendZ
hook_blockr   abrd  r/   r/   r0   rA    s4   





z@AutogradCompilerInstance.reorder_pre_hook_nodes_to_schedule_asapc                 C   s   g }| j jjdtdD ]}|jdddkrq|| qt|D ]D}|jd }t	|j
 }t|dkr6q!tdd |D sAJ tt|d j
 }||jure|| || |D ]}|| q]q!dS )	a%  
        Usage of AOTAutograd causes all the pre_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them
        right before their registered node execution. This pass attempts to
        reorder the graph to mimic eager behavior.
        r   r]  r  Nr  r   c                 s   s&    | ]}|j d ko|jtjkV  qdS )r   N)r   r   rZ  r[  r  r/   r/   r0   rx   #  s
    
zQAutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eager.<locals>.<genexpr>)rj   r   r/  r	   rA   rS   r   reversedr@   r   r!  r"  r   r$  r~   rT  rf  )r<   Z	pre_hooksr   Zhook_getitem_noder!  Zregistered_noder[  r/   r/   r0   rC    s.   




z>AutogradCompilerInstance.reorder_pre_hook_nodes_to_mimic_eagerc                 C   s   g }| j jjdtdD ]}|jdddkrq|| qt|D ]<}|jd }|jd }d}t	|j
 D ]}|jdkrJ|jtjjjjkrJ|} nq6|dusSJ d|| || q!dS )	a  
        Usage of AOTAutograd causes all the post_acc_grad_hook nodes to get
        pushed to the end of the graph. This differs from eager mode, which
        schedules them as soon as possible. This pass attempts to reorder the
        graph to mimic eager behavior.
        r   r]  r  Nr  r   r>   z8post_acc_grad_hook must have corresponding acc grad node)rj   r   r/  r	   rA   rS   r   rn  r@   r   r!  r"  r   r   r-   r   r^  r_  r)  )r<   Zpost_acc_grad_hooksr   rc  rb  Zacc_grad_noder   r/   r/   r0   rD  /  s.   





z9AutogradCompilerInstance.reorder_post_acc_grad_hook_nodesc                    sl  g }| j jjdtdD ]  jdddkrq|  qt|D ]  jd } jd } jd }t	|dkr9q!g }|
t| |D ]}|
 fd	d
t|j D  qDt|}|jdkr|jtjjjjkr|jd }d}	t|j D ]}
|
jdkr|
jtkr|
jdddkr|
}	qx|	dur|	| |  q!| jur| |s|| |  q!dS )a  
        Usage of AOTAutograd causes all the post_hook nodes to get pushed to the
        end of the graph. This differs from eager mode, which schedules them as
        soon as possible. This pass attempts to reorder the graph to mimic eager
        behavior.
        r   r]  r  Nr  r   r>      c                 3   s:    | ]}|j d kr|jtkr jdddks|V  qdS )r   r  Nr  )r   r   r	   rA   rS   r  r,  r/   r0   rx   p  s    

zCAutogradCompilerInstance.reorder_post_hook_nodes.<locals>.<genexpr>r  )rj   r   r/  r	   rA   rS   r   rn  r@   r   extendr   r!  r"  r`  r   r   r-   r   r^  r_  r)  ra  r\  )r<   Z
post_hooksrc  Zoutput_nodesrj  Zinput_nodes_and_usersrg  rd  rb  Zpost_acc_grad_hook_noder   r/   r,  r0   rE  V  sN   













z0AutogradCompilerInstance.reorder_post_hook_nodesc                    s   |d u rd S t |tr fdd|D S t |tr$t fdd|D S t |tjtjfr3 j|j S t |tjs;|S t	 j
|}t |tjjjjsLJ |jS )Nc                    r   r/   r   r  r;   r/   r0   ry     r   z5AutogradCompilerInstance.to_proxy.<locals>.<listcomp>c                 3   s    | ]}  |V  qd S r2   r   r  r;   r/   r0   rx     s    z4AutogradCompilerInstance.to_proxy.<locals>.<genexpr>)rn   r   r   r-   ZSymIntZSymFloatr   r   ro   r    rj   r   r   proxy_tensorZ_ProxyTensorr  )r<   trq  r/   r;   r0   r     s   

z!AutogradCompilerInstance.to_proxyc                    s   t  tjjrB|r5t|t|ksJ g }tt|D ]}|| \}}| ||d  | |  q| n fddtt|D  t|t ksLJ t| d | j	d  S )Nc                    r   r/   r/   r   r   r/   r0   ry     r  zDAutogradCompilerInstance.bind_objects_to_proxies.<locals>.<listcomp>ZconstantZtracer)
rn   r-   r   r*   r   r   set_node_originr   r#   rj   )r<   objectsr   rr   Zbound_proxiesr   rV  	node_namer/   rs  r0   r     s   z0AutogradCompilerInstance.bind_objects_to_proxiesindexc                 C   s4   | j d usJ | j | }t }t||d | jd |S )Nrt  )rl   r   r#   rj   )r<   rx  r  Zbw_stater/   r/   r0   bind_backward_state  s
   
z,AutogradCompilerInstance.bind_backward_staterw  rV  pyobjc           	      C   s   d}|d ur-|j }t|dr-|jd u rtd|| _|j}t| jjj	||jj
d| j|< | | d| d}t  d }|d|}t| d S )	NrQ  r   zThis compiled backward function was saved by AOTAutogradCache, which does not support
                    compiled autograd. Please turn off AOTAutogradCache using `TORCHINDUCTOR_AUTOGRAD_CACHE=0`.)rO  rP  rR  z (NodeCall )r  z:raw_stack_trace = CapturedTraceback.extract().format()[-1])r   rF   Z_lazy_backward_infoRuntimeErrorr   r   r   rj   r   r   Z	bw_moduler   r)   extractformatreplacer'   )	r<   rw  rV  rz  Zmaybe_aot_idZforward_clsZnew_codeZraw_stack_traceZnew_stack_tracer/   r/   r0   ru    s(   

z(AutogradCompilerInstance.set_node_originrb   Nr2   ):rT   rU   rV   r=   rp   staticmethodr   rm   r   r-   ro   r   r   r   r   r   r   r   r   autogradfunctionZBackwardCFunctionr   r   r   r   r   r   r  r   r  r  r
  r  r  r  r  r  r(  r*  r3  r4  rJ  r>  rY  r\  rB  r?  r@  rA  rC  rD  rE  r   r   ry  Functionru  r/   r/   r/   r0   ra      s    

c
 
3
1`b

	$#'<

ra   Fc              
   c   s    |rt |tu sJ ddlm} |jjdkr%dazd V  W dad S daw dd l}|jj	j
tt| |\}}t rC|jj	j
t daz)|jd d V  W d    n1 sZw   Y  W |sdda|jj	j
|| d S |ssda|jj	j
|| w )Nr   )
eval_frameZforce_eagerTF)r   boolZtorch._dynamor  Z_stanceZstance%compiled_autograd_enabled_force_eagerZtorch._inductor.cudagraph_trees_CrI   r+   set_autograd_compiler	functoolspartialra   r1   set_verbose_loggerr&  compiled_autograd_enabledr  Zset_multithreading_enabled)rd   Zdynamicr  r-   prior_compilerprior_dynamicr/   r/   r0   _enable  sD   


r  c               
   c   s`    t jjjd d\} }dazd V  W | rdat jjj| | d S | r&dat jjj| | w )NFT)r-   r  rI   r+   r  r  )r  r  r/   r/   r0   r:    s$   

r:  rb   c                   C   sH   da trJ tjjjd d tjjjd  tjjj  t	
 ad S )NF)r  r9  r-   r  rI   r+   r  r  clear_cache	itertoolsrP   r   r/   r/   r/   r0   reset#  s   r  c                 C   sT   | d }| ||}|d usJ || || }	||||	}
||
|
jtjdgS )Nr   )Zmemory_format)Znew_empty_stridedcopy_Z
as_stridedcloner-   Zcontiguous_format)r[   Z
base_sizesZbase_stridesZbase_storage_offsetZ
view_sizesZview_stridesZview_storage_offsetZgradrC   offsetr   r/   r/   r0   r   0  s   	
r   c                 C   sf   d gt |  }tt | D ]#}| | r0|| d u rq|dkr*|||  |||< q|| ||< q|S )Nr   )r   r   r  )r   rC   r   r   Zgrad_inputsr   r/   r/   r0   r   D  s   
r   )Fr  )^__doc__re   r  r  rZ  r   collectionsr   r   typingr   r   r   r   r-   Ztorch.utils._pytreeutilsZ_pytreer   Ztorch._dynamo.external_utilsr   r	   r
   Ztorch._dynamo.sourcer   r   Ztorch._dynamo.utilsr   r   r   r   Ztorch._guardsr   r   r   Ztorch._loggingr   r   Ztorch._prims_commonr   Ztorch._subclassesr   Ztorch.fxr   Z%torch.fx.experimental._backward_stater   Z"torch.fx.experimental.proxy_tensorr   r   r   r    r!   r"   r#   Z%torch.fx.experimental.symbolic_shapesr$   r%   Ztorch.fx.tracebackr&   r'   Ztorch.utils._ordered_setr(   Ztorch.utils._tracebackr)   Ztorch.fx.proxyr*   rT   rF  r&  r1   r4   r7   r9   rG   r   r   r=  r^  r_  r)  r+  rP   r   r`   ra   r  r  r9  contextmanagerr  r:  r  r   r   r/   r/   r/   r0   <module>   s~   $	


	        C)

