"""
This module implements CUDA graphs support for TorchDynamo backends.

CUDA graphs allow for capturing and replaying GPU operations, which can significantly
reduce CPU overhead in GPU-accelerated PyTorch models. This module provides:

- CUDA graph creation and management for both forward and backward passes
- Input mutation detection and handling
- Device compatibility checking
- Stack trace management for debugging
- Integration with TorchInductor's cudagraph trees

The backend supports two main modes:
1. cudagraphs: Full CUDA graph support with both forward and backward pass optimization
2. cudagraphs_inner: Lower-level CUDA graph implementation used for benchmarking

Key components:
- CudagraphsBackend: Main backend class for CUDA graph integration
- Mutation detection utilities to ensure graph safety
- Device mapping and compatibility checks
- Stack trace collection for debugging
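
Example (a minimal sketch, assuming a CUDA device is available; the toy model
and shapes below are illustrative placeholders, not part of this module):

    import torch

    model = torch.nn.Linear(64, 64).cuda()

    # The registered TorchDynamo backend.
    compiled = torch.compile(model, backend="cudagraphs")
    out = compiled(torch.randn(8, 64, device="cuda"))

    # The lower-level helper used by some benchmarks.
    from torch._dynamo.backends.cudagraphs import cudagraphs_inner

    run = cudagraphs_inner(model, [torch.randn(8, 64, device="cuda")])
    out = run(torch.randn(8, 64, device="cuda"))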
    N)defaultdict)Optional)config)aot_autograd)	boxed_nop)BoxedDeviceIndex'check_multiple_devices_or_any_cpu_nodesformat_default_skip_messageget_mutation_stack_traceget_placeholder_info#log_cudagraph_skip_and_bump_counter)	BoxedBoolcount_tangents%get_first_incompatible_cudagraph_nodenum_fw_fixed_argumentsoutput_node)StorageWeakRef   )register_backendc                 C   s  dd }t t}d}t }| jD ]}|jdkrft||jtjr\|t||j	  
| |d7 }q |jdkr t|jds~q |jj}t|jD ]p\}}|t|jk r|j| }	n|j|jvrq|j|j }	d}
|jr|jjrd	}
|
r||t||	j	  O }qq |S )
Nc                 S   s   d| v r| d S | d S )NvalZfake_result )metar   r   O/var/www/auris/lib/python3.9/site-packages/torch/_dynamo/backends/cudagraphs.pymeta_fk7   s    z%find_input_mutations.<locals>.meta_fkr   placeholderr   call_function_schemaFT)r   setnodesop
isinstancer   torchTensorr   Z_typed_storageaddhasattrtargetr   	enumerate	argumentslenargsnamekwargsZ
alias_infoZis_write)gr   inputsZ	input_idxZmutated_inputsnZschemaiargargumentZmut_argr   r   r   find_input_mutations6   s8    




r2   )gmc                 C   sD   i }| j jD ]2}|jdd }t|tjr|j|vr|||j< q|S )Nr   )graphr   r   getr    r!   r"   device)r3   Zdevice_node_mappingr.   tr   r   r   get_device_node_mapping]   s    r8   )	aot_modelreturnc                 C   s2   t | jtt| }|sd S t| j}t||S N)r2   r4   r   ranger   r
   )r9   	num_fixedZmutation_indicesplaceholdersr   r   r   3check_for_mutation_ignore_cuda_graph_managed_tensorf   s
    
r?   c                 C   sN   t jst| | }r|S tt|  }r,|S t|  }rJtd|j dS d S )Nzincompatible op ())r   Z(cudagraph_backend_support_input_mutationr?   r   r8   r   r	   r*   )r9   r=   Zmut_skipskipnoder   r   r   check_for_skipq   s    rC   )r:   c                 C   s$   t tt| }|jdksJ |jS )Ncuda)nextiterr8   typeindex)r3   r6   r   r   r   get_device_index   s    rI   c                 C   s.   t | }t|jdksJ dd |jd D S )Nr   c                 S   s&   g | ]}t |tjjjr|jnd qS r;   )r    r!   fxrB   NodeZstack_trace).0r0   r   r   r   
<listcomp>   s   z$get_stack_traces.<locals>.<listcomp>r   )r   r(   r)   )r3   outputr   r   r   get_stack_traces   s
    rO   c                    sj   ddl m tdtd  d fdd	} fdd}t||tj|dd	tjj	j
d
}|| S )Nr   )cudagraphify_implTFc                    s   t | |}ttt|}t| | }rFt td|  |S  t|  ||t	| j
ddt| t| jt| jd	}d|_|S )Nzskipping cudagraphs due to FZdevice_indexZis_backwardis_inferenceZstack_tracesr>   Zmutated_input_idxsT)r   r   r(   rC   r   disabler   r   rI   r<   valuerO   r   r4   r2   _boxed_call)r9   
aot_inputsrR   interpfixedskip_msgoutboxed_device_indexrP   do_cudagraphsdynamo_inputsr   r   forward_cudagraphs   s,    

z&cudagraphs.<locals>.forward_cudagraphsc                    s   t  |}s S t }t | }rjtd| tjjjjddd usRJ  fdd}d|_	|S ||t
|t ddt t jt jd	}d|_	|S )Nzskipping cudagraphs due to %sF)Zcreate_if_none_existsc                    s       | S r;   )Zset_to_running_backward)r-   r9   managerr   r   fn   s    z3cudagraphs.<locals>.backward_cudagraphs.<locals>.fnTrQ   )r   r   rC   r   r!   Z	_inductorZcudagraph_treesZget_managerrT   rU   r<   rI   rO   r   r4   r2   )r9   rV   rW   rX   rY   rb   rZ   )r\   rP   r]   r`   r   backward_cudagraphs   s8    
z'cudagraphs.<locals>.backward_cudagraphs)rR   )Zfw_compilerZbw_compilerZinference_compilerZkeep_inference_input_mutations)F)torch._inductor.cudagraph_treesrP   r   r   r   	functoolspartialr!   Z_dynamor   Z%cudagraph_backend_keep_input_mutation)Zdynamo_modelr^   r_   rc   Zaot_cudagraphsr   r[   r   
cudagraphs   s    &rg   c                   @   s(   e Zd ZdZedd Zedd ZdS )CudagraphsBackendrg   c                  C   s   ddl m}  |   d S )Nr   reset_cudagraph_trees)rd   rj   ri   r   r   r   reset   s    zCudagraphsBackend.resetc                 C   s
   t | |S r;   )rg   )modelr-   r   r   r   __call__   s    zCudagraphsBackend.__call__N)__name__
__module____qualname__Zcompiler_namestaticmethodrk   rm   r   r   r   r   rh      s
   
rh   )r*   Zcompiler_fnTc                    s  t |ttfsJ  r&dd |D nt|tj  tj }|tj  tj	| | |  W d   n1 s|0    Y  |  tj | tj  tj
 tjj|d |  W d   n1 s0    Y  t ttfsf fdd}|S )zBThis isn't registered as a backend, but is used in some benchmarksc                 S   s   g | ]}t |qS r   )r!   Z
zeros_likerL   xr   r   r   rM          z$cudagraphs_inner.<locals>.<listcomp>N)streamc                     sX   t t | ksJ  r6t| D ]\}}|| q"  rPdd D S S d S )Nc                 S   s   g | ]}|  qS r   )clonerr   r   r   r   rM     rt   z1cudagraphs_inner.<locals>.run.<locals>.<listcomp>)r(   zipZcopy_Zreplay)Z
new_inputsdstsrccopy_inputscopy_outputsr4   Zstatic_inputsZstatic_outputsr   r   run  s    zcudagraphs_inner.<locals>.run)r    listtupler!   rD   ZsynchronizeZStreamZwait_streamZcurrent_streamru   Z	CUDAGraphr4   )rl   r-   r|   r{   ru   r}   r   rz   r   cudagraphs_inner   s&    

&

&r   )TT),__doc__re   collectionsr   typingr   r!   Ztorch._dynamor   Ztorch._dynamo.backends.commonr   Z torch._dynamo.backends.debuggingr   Ztorch._inductor.cudagraph_utilsr   r   r	   r
   r   r   Ztorch._inductor.utilsr   r   r   r   r   Z torch.multiprocessing.reductionsr   registryr   r2   rJ   ZGraphModuler8   strr?   rC   intrI   r~   rO   rg   rh   r   r   r   r   r   <module>   s.    '
	N