
    [Th.$                        S r SSKrSSKJr  SSKJr  SSKrSSKJr  SSK	J
r
  SSKJr  SSKJrJrJrJrJrJr  SS	KJrJrJrJrJr  SS
KJr  SSKJr  S rS\R>                  R@                  4S jr!S\R>                  R@                  S\\"   4S jr#S\R>                  R@                  S\\"   4S jr$S\%4S jr&S\'\\"      4S jr(S r) " S S5      r*\" S\*" 5       S9  SS jr+g)a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Optional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendc           	         S n[        [        5      nSn[        5       nU R                   GH  nUR                  S:X  aq  [	        U" UR
                  5      [        R                  5      (       a;  U[        U" UR
                  5      R                  5       5         R                  U5        US-  nM  UR                  S:X  d  M  [        UR                  S5      (       d  M  UR                  R                  n[        UR                  5       H  u  pxU[!        UR"                  5      :  a  UR"                  U   n	O5UR$                  UR&                  ;  a  MJ  UR&                  UR$                     n	Sn
UR(                  (       a  UR(                  R*                  (       a  Sn
U
(       d  M  XB[        U" U	R
                  5      R                  5       5         -  nM     GM     U$ )	Nc                 "    SU ;   a  U S   $ U S   $ )Nvalfake_result )metas    Y/var/www/auris/envauris/lib/python3.13/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk%find_input_mutations.<locals>.meta_fk7   s    #tmtE{Dm1DD    r   placeholderr   call_function_schemaFT)r   setnodesop
isinstancer   torchTensorr   _typed_storageaddhasattrtargetr"   	enumerate	argumentslenargsnamekwargs
alias_infois_write)gr   inputs	input_idxmutated_inputsnschemaiargargumentmut_args              r   find_input_mutationsr?   6   sb   E FIUNWW44= '!&&/5<<88~gaffo&D&D&FGHLLYWNITT_$188Y//XX%%F#F$4$45s166{? vvayHxxqxx/  xx1H>>~~.."&7 #&wx}}'='L'L'NO' N 6 : r   gmc                     0 nU R                   R                   H`  nUR                  R                  SS 5      n[	        U[
        R                  5      (       d  M@  UR                  U;  d  MR  X!UR                  '   Mb     U$ )Nr   )graphr$   r   getr&   r'   r(   device)r@   device_node_mappingr9   ts       r   get_device_node_mappingrG   ]   s_    =?XX^^FFJJud#a&&188;N+N,-)  r   	aot_modelreturnc                     [        U R                  5      [        [        U5      5      -
  nU(       d  g [	        U R                  5      n[        X25      $ N)r?   rB   r#   ranger   r   )rH   	num_fixedmutation_indicesplaceholderss       r   3check_for_mutation_ignore_cuda_graph_managed_tensorrP   f   sA     ,IOO<s5CS?TT'	8L#LCCr   c                     [         R                  (       d  [        X5      =n(       a  U$ [        [	        U 5      5      =n(       a  U$ [        U 5      =n(       a  [        SUR                   S35      $ g )Nzincompatible op ())r   (cudagraph_backend_support_input_mutationrP   r	   rG   r   r
   r1   )rH   rM   mut_skipskipnodes        r   check_for_skiprW   q   sx    ::J
 
8 
 O6	* t  4Y??t?*->tyyk+KLLr   c                 x    [        [        [        U 5      5      5      nUR                  S:X  d   eUR                  $ )Ncuda)nextiterrG   typeindex)r@   rD   s     r   get_device_indexr^      s3    $.r234F;;&   <<r   c                    [        U 5      n[        UR                  5      S:X  d   eUR                  S    Vs/ s HD  n[        U[        R
                  R                  R                  5      (       a  UR                  OS PMF     sn$ s  snf )Nr   r   )	r   r/   r0   r&   r'   fxrV   Nodestack_trace)r@   outputr<   s      r   get_stack_tracesrd      st    _Fv{{q    ;;q>!C 'sEHHMM,>,>??T	I!  s   ABc           	         ^^^^ SSK Jm  [        S5      m[        S 5      mSUUUU4S jjnUUU4S jn[	        UU[
        R                  " USS9[        R                  R                  R                  S9nU" U T5      $ )	Nr   )cudagraphify_implTc                   > [        X5      n[        [        T
5      [        U5      5      n[        X5      =n(       a&  [        R
                  " T	5        [        SU 35        U$ TR                  [        U 5      5        T" UU[        U5      TR                  SS[        U 5      [        U R                  5      [        U R                  5      S9	nSUl        U$ )Nzskipping cudagraphs due to Fdevice_indexis_backwardis_inferencestack_tracesrO   mutated_input_idxsT)r   r   r/   rW   r   disabler   r#   r^   rL   valuerd   r   rB   r?   _boxed_call)rH   
aot_inputsrk   interpfixedskip_msgoutboxed_device_indexrf   do_cudagraphsdynamo_inputss          r   forward_cudagraphs&cudagraphs.<locals>.forward_cudagraphs   s    91&s='93z?K%i7787m,/-hZ8 M/	:;%L+11))4-ioo>3IOOD

 
r   c                   >^ ^ [        T U5      nT
(       d  T $ [        T 5      n[        T U5      =n(       aS  [        SU5        [        R
                  R                  R                  TR                  SS9mTc   eU U4S jnSUl	        U$ T	" UU[        U5      [        T 5      SS[        T 5      [        T R                  5      [        T R                  5      S9	nSUl	        U$ )Nzskipping cudagraphs due to %sF)create_if_none_existsc                 4   > TR                  5         T" U 5      $ rK   )set_to_running_backward)r6   rH   managers    r   fn3cudagraphs.<locals>.backward_cudagraphs.<locals>.fn   s    //1 ((r   Trh   )r   r   rW   r   r'   	_inductorcudagraph_treesget_managerro   rp   rL   r^   rd   r   rB   r?   )rH   rq   rr   rs   rt   r   ru   r   rv   rf   rw   s   `      @r   backward_cudagraphs'cudagraphs.<locals>.backward_cudagraphs   s    9j1y)%i7787//
 oo55AA"(( B G &&&) "BNI%L))4))4-ioo>3IOOD

 
r   )rk   )fw_compilerbw_compilerinference_compilerkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrf   r   r   r   	functoolspartialr'   _dynamor   %cudagraph_backend_keep_input_mutation)dynamo_modelrx   ry   r   aot_cudagraphsrv   rf   rw   s    `   @@@r   
cudagraphsr      so    AdOM)$/ 2$L "&'$,,-?dS',}}';';'a'a	N ,66r   c                   8    \ rS rSrSr\S 5       r\S 5       rSrg)CudagraphsBackend   r   c                      SSK Jn   U " 5         g )Nr   reset_cudagraph_trees)r   r   r   s    r   resetCudagraphsBackend.reset   s    Ir   c                     [        X5      $ rK   )r   )modelr6   s     r   __call__CudagraphsBackend.__call__   s    %((r   r   N)	__name__
__module____qualname____firstlineno__compiler_namestaticmethodr   r   __static_attributes__r   r   r   r   r      s-     M   
 ) )r   r   r   )r1   compiler_fnc                   ^^^^^	 [        U[        [        45      (       d   eT(       a(  U Vs/ s H  n[        R                  " U5      PM     snmO[        U5      m[        R
                  R                  5         [        R
                  R                  5       nUR                  [        R
                  R                  5       5        [        R
                  R                  U5         U " U6   SSS5        UR                  5         [        R
                  R                  5       R                  U5        [        R
                  R                  5         [        R
                  R                  5       m[        R
                  R                  TUS9   U " T6 m	SSS5        [        T	[        [        45      (       d  T	4m	UUUUU	4S jnU$ s  snf ! , (       d  f       N= f! , (       d  f       NN= f)zBThis isn't registered as a backend, but is used in some benchmarksN)streamc                    > [        T5      [        U 5      :X  d   eT(       a&  [        TU 5       H  u  pUR                  U5        M     TR                  5         T(       a   T Vs/ s H  o3R	                  5       PM     sn$ T$ s  snf rK   )r/   zipcopy_replayclone)	
new_inputsdstsrcxcopy_inputscopy_outputsrB   static_inputsstatic_outputss	       r   runcudagraphs_inner.<locals>.run  sk    =!S_444z:		# ;'56~!GGI~66!! 7s   $B)r&   listtupler'   
zeros_likerY   synchronizeStreamwait_streamcurrent_streamr   	CUDAGraphrB   )
r   r6   r   r   r   r   r   rB   r   r   s
     ``   @@@r   cudagraphs_innerr      sU   ftUm,,,,6<=f))!,f=V 
JJZZ F
uzz0023			6	"v 
#
	JJ++F3	JJ JJ  "E			%		/. 
0ntUm44(*	" 	" JA > 
#	" 
0	/s    G%G
G
G
G&)TT),__doc__r   collectionsr   typingr   r'   torch._dynamor   torch._dynamo.backends.commonr    torch._dynamo.backends.debuggingr   torch._inductor.cudagraph_utilsr   r	   r
   r   r   r   torch._inductor.utilsr   r   r   r   r    torch.multiprocessing.reductionsr   registryr   r?   r`   GraphModulerG   strrP   rW   intr^   r   rd   r   r   r   r   r   r   <module>r      s   .  #     6 6   < &$N 4 4 Dxx##Dc]Dehh22 (3- $C D#/ K7\) )  l0A0C D$r   