o
    Zh.$                     @   sH  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ dd Zdejj fddZ!dejj dee" fddZ#dejj dee" fddZ$de%fddZ&de'ee"  fddZ(dd Z)G dd dZ*ede* d  d$d"d#Z+dS )%a  
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
    N)defaultdict)Optional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendc                 C   s  dd }t t}d}t }| jD ]q}|jdkr3t||jtjr.|t||j	  
| |d7 }q|jdkrt|jds?q|jj}t|jD ]8\}}|t|jk rY|j| }	n|j|jvr`qH|j|j }	d}
|jrq|jjrqd	}
|
r||t||	j	  O }qHq|S )
Nc                 S   s   d| v r| d S | d S )NvalZfake_result )metar   r   P/var/www/auris/lib/python3.10/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk7   s   z%find_input_mutations.<locals>.meta_fkr   placeholderr   Zcall_function_schemaFT)r   setnodesop
isinstancer   torchTensorr   Z_typed_storageaddhasattrtargetr   	enumerate	argumentslenargsnamekwargsZ
alias_infoZis_write)gr   inputsZ	input_idxZmutated_inputsnZschemaiargargumentZmut_argr   r   r   find_input_mutations6   s:   



r1   gmc                 C   sD   i }| j jD ]}|jdd }t|tjr|j|vr|||j< q|S )Nr   )graphr   r   getr   r    r!   device)r2   Zdevice_node_mappingr-   tr   r   r   get_device_node_mapping]   s   
r7   	aot_modelreturnc                 C   s2   t | jtt| }|sd S t| j}t||S N)r1   r3   r   ranger   r
   )r8   	num_fixedZmutation_indicesplaceholdersr   r   r   3check_for_mutation_ignore_cuda_graph_managed_tensorf   s
   

r>   c                 C   sN   t jst| | }r|S tt|  }r|S t|  }r%td|j dS d S )Nzincompatible op ())r   Z(cudagraph_backend_support_input_mutationr>   r   r7   r   r	   r)   )r8   r<   Zmut_skipskipnoder   r   r   check_for_skipq   s   rB   c                 C   s$   t tt| }|jdksJ |jS )Ncuda)nextiterr7   typeindex)r2   r5   r   r   r   get_device_index   s   rH   c                 C   s.   t | }t|jdksJ dd |jd D S )Nr   c                 S   s&   g | ]}t |tjjjr|jnd qS r:   )r   r    fxrA   NodeZstack_trace).0r/   r   r   r   
<listcomp>   s    z$get_stack_traces.<locals>.<listcomp>r   )r   r'   r(   )r2   outputr   r   r   get_stack_traces   s
   rN   c                    sj   ddl m tdtd  d fdd	} fdd}t||tj|dd	tjj	j
d
}|| S )Nr   )cudagraphify_implTFc                    s   t | |}ttt|}t| | }r#t td|  |S  t|  ||t	| j
ddt| t| jt| jd	}d|_|S )Nzskipping cudagraphs due to FZdevice_indexZis_backwardis_inferenceZstack_tracesr=   Zmutated_input_idxsT)r   r   r'   rB   r   disabler   r   rH   r;   valuerN   r   r3   r1   _boxed_call)r8   
aot_inputsrQ   interpfixedskip_msgoutboxed_device_indexrO   do_cudagraphsdynamo_inputsr   r   forward_cudagraphs   s,   

z&cudagraphs.<locals>.forward_cudagraphsc                    s   t  |}s	 S t }t | }r5td| tjjjjddd us)J  fdd}d|_	|S ||t
|t ddt t jt jd	}d|_	|S )Nzskipping cudagraphs due to %sF)Zcreate_if_none_existsc                    s       | S r:   )Zset_to_running_backward)r,   r8   managerr   r   fn   s   z3cudagraphs.<locals>.backward_cudagraphs.<locals>.fnTrP   )r   r   rB   r   r    Z	_inductorZcudagraph_treesZget_managerrS   rT   r;   rH   rN   r   r3   r1   )r8   rU   rV   rW   rX   ra   rY   )r[   rO   r\   r_   r   backward_cudagraphs   s8   
z'cudagraphs.<locals>.backward_cudagraphs)rQ   )Zfw_compilerZbw_compilerZinference_compilerZkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrO   r   r   r   	functoolspartialr    Z_dynamor   Z%cudagraph_backend_keep_input_mutation)Zdynamo_modelr]   r^   rb   Zaot_cudagraphsr   rZ   r   
cudagraphs   s   &
rf   c                   @   s(   e Zd ZdZedd Zedd ZdS )CudagraphsBackendrf   c                  C   s   ddl m}  |   d S )Nr   reset_cudagraph_trees)rc   ri   rh   r   r   r   reset   s   
zCudagraphsBackend.resetc                 C   s
   t | |S r:   )rf   )modelr,   r   r   r   __call__   s   
zCudagraphsBackend.__call__N)__name__
__module____qualname__Zcompiler_namestaticmethodrj   rl   r   r   r   r   rg      s    
rg   )r)   Zcompiler_fnTc                    s  t |ttfs	J  rdd |D nt|tj  tj }|tj  tj	| | |  W d   n1 s>w   Y  |  tj | tj  tj
 tjj|d |  W d   n1 spw   Y  t ttfsf fdd}|S )zBThis isn't registered as a backend, but is used in some benchmarksc                 S   s   g | ]}t |qS r   )r    Z
zeros_likerK   xr   r   r   rL      s    z$cudagraphs_inner.<locals>.<listcomp>N)streamc                     sT   t t | ks
J  rt| D ]	\}}|| q  r(dd D S S )Nc                 S   s   g | ]}|  qS r   )clonerq   r   r   r   rL     s    z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>)r'   zipZcopy_Zreplay)Z
new_inputsdstsrccopy_inputscopy_outputsr3   Zstatic_inputsZstatic_outputsr   r   run  s   zcudagraphs_inner.<locals>.run)r   listtupler    rC   ZsynchronizeZStreamZwait_streamZcurrent_streamrs   Z	CUDAGraphr3   )rk   r,   rz   ry   rs   r{   r   rx   r   cudagraphs_inner   s*   





r~   )TT),__doc__rd   collectionsr   typingr   r    Ztorch._dynamor   Ztorch._dynamo.backends.commonr   Z torch._dynamo.backends.debuggingr   Ztorch._inductor.cudagraph_utilsr   r   r	   r
   r   r   Ztorch._inductor.utilsr   r   r   r   r   Z torch.multiprocessing.reductionsr   registryr   r1   rI   ZGraphModuler7   strr>   rB   intrH   r|   rN   rf   rg   r~   r   r   r   r   <module>   s4    '	
	N