a
    ‘º”h¢^  ã                   @   s´   d dl Z d dlZd dlZddlmZ eejdƒs`edƒejjd< edƒejjd< edƒejjd< d dlm	Z	m
Z
mZ d	d
„ Zdd„ ZG dd„ dejj
ƒZG dd„ dƒZddd„ZdS )é    Né   )Ú_dummy_typeZ_CudaStreamBaseÚ
_CUDAGraphÚ_graph_pool_handleÚ_cuda_isCurrentStreamCapturing)r   r   r   c                   C   s   t ƒ S )zÌReturn True if CUDA graph capture is underway on the current CUDA stream, False otherwise.

    If a CUDA context does not exist on the current device, returns False without initializing the context.
    )r   © r   r   ú?/var/www/auris/lib/python3.9/site-packages/torch/cuda/graphs.pyÚis_current_stream_capturing   s    r	   c                   C   s   t ƒ S )zÚReturn an opaque token representing the id of a graph memory pool.

    See :ref:`Graph memory management<graph-memory-management>`.

    .. warning::
        This API is in beta and may change in future releases.
    )r   r   r   r   r   Úgraph_pool_handle"   s    r
   c                       s   e Zd ZdZd‡ fdd„	Zd‡ fdd„	Z‡ fd	d
„Z‡ fdd„Z‡ fdd„Z‡ fdd„Z	‡ fdd„Z
‡ fdd„Z‡ fdd„Z‡ fdd„Z‡  ZS )Ú	CUDAGrapha/  Wrapper around a CUDA graph.

    Arguments:
        keep_graph (bool, optional): If ``keep_graph=False``, the
            cudaGraphExec_t will be instantiated on GPU at the end of
            ``capture_end`` and the underlying cudaGraph_t will be
            destroyed. Users who want to query or otherwise modify the
            underlying cudaGraph_t before instantiatiation can set
            ``keep_graph=True`` and access it via ``raw_cuda_graph`` after
            ``capture_end``. Note that the cudaGraphExec_t will not be
            instantiated at the end of ``capture_end`` in this
            case. Instead, it wil be instantiated via an explicit called
            to ``instantiate`` or automatically on the first call to
            ``replay`` if ``instantiate`` was not already called. Calling
            ``instantiate`` manually before ``replay`` is recommended to
            prevent increased latency on the first call to ``replay``. It
            is allowed to modify the raw cudaGraph_t after first calling
            ``instantiate``, but the user must call ``instantiate`` again
            manually to make sure the instantiated graph has these
            changes. Pytorch has no means of tracking these changes.

    .. warning::
        This API is in beta and may change in future releases.

    Fc                    s   t ƒ  | |¡S ©N)ÚsuperÚ__new__)ÚclsZ
keep_graph©Ú	__class__r   r   r   I   s    zCUDAGraph.__new__NÚglobalc                    s   t ƒ j||d dS )að  Begin capturing CUDA work on the current stream.

        Typically, you shouldn't call ``capture_begin`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_begin`` internally.

        Arguments:
            pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
                :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
                with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
            capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
                Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
                may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
                actions in the current thread, and "relaxed" will not error on these actions. Do NOT change this setting
                unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_
        )ÚpoolÚcapture_error_modeN)r   Úcapture_begin)Úselfr   r   r   r   r   r   L   s    zCUDAGraph.capture_beginc                    s   t ƒ  ¡  dS )aG  End CUDA graph capture on the current stream.

        After ``capture_end``, ``replay`` may be called on this instance.

        Typically, you shouldn't call ``capture_end`` yourself.
        Use :class:`~torch.cuda.graph` or :func:`~torch.cuda.make_graphed_callables`,
        which call ``capture_end`` internally.
        N)r   Úcapture_end©r   r   r   r   r   _   s    	zCUDAGraph.capture_endc                    s   t ƒ  ¡  dS )a$  Instantiate the CUDA graph. Will be called by
        ``capture_end`` if ``keep_graph=False``, or by ``replay`` if
        ``keep_graph=True`` and ``instantiate`` has not already been
        explicitly called. Does not destroy the cudaGraph_t returned
        by ``raw_cuda_graph``.
        N)r   Úinstantiater   r   r   r   r   j   s    zCUDAGraph.instantiatec                    s   t ƒ  ¡  dS )z,Replay the CUDA work captured by this graph.N)r   Úreplayr   r   r   r   r   s   s    zCUDAGraph.replayc                    s   t ƒ  ¡  dS )z1Delete the graph currently held by this instance.N)r   Úresetr   r   r   r   r   w   s    zCUDAGraph.resetc                    s
   t ƒ  ¡ S )zäReturn an opaque token representing the id of this graph's memory pool.

        This id can optionally be passed to another graph's ``capture_begin``,
        which hints the other graph may share the same memory pool.
        )r   r   r   r   r   r   r   {   s    zCUDAGraph.poolc                    s
   t ƒ  ¡ S )z/Enable debugging mode for CUDAGraph.debug_dump.)r   Úenable_debug_moder   r   r   r   r   ƒ   s    zCUDAGraph.enable_debug_modec                    s   t ƒ  |¡S )zÖ
        Arguments:
            debug_path (required): Path to dump the graph to.

        Calls a debugging function to dump the graph if the debugging is
        enabled via CUDAGraph.enable_debug_mode()
        )r   Ú
debug_dump)r   Z
debug_pathr   r   r   r   ‡   s    zCUDAGraph.debug_dumpc                    s
   t ƒ  ¡ S )a}  Returns the underlying cudaGraph_t. ``keep_graph`` must be True.

        See the following for APIs for how to manipulate this object: `Graph Managmement <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html>`_ and `cuda-python Graph Management bindings <https://nvidia.github.io/cuda-python/cuda-bindings/latest/module/runtime.html#graph-management>`_
        )r   Úraw_cuda_graphr   r   r   r   r   ‘   s    zCUDAGraph.raw_cuda_graph)F)Nr   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r   r   r   r   r   r   r   Ú__classcell__r   r   r   r   r   .   s   	
r   c                   @   sD   e Zd ZU dZdZejd ed< dedœdd„Z	d	d
„ Z
dd„ ZdS )ÚgraphaÅ  Context-manager that captures CUDA work into a :class:`torch.cuda.CUDAGraph` object for later replay.

    See :ref:`CUDA Graphs <cuda-graph-semantics>` for a general introduction,
    detailed use, and constraints.

    Arguments:
        cuda_graph (torch.cuda.CUDAGraph): Graph object used for capture.
        pool (optional): Opaque token (returned by a call to :func:`~torch.cuda.graph_pool_handle()` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) hinting this graph's capture
            may share memory from the specified pool. See :ref:`Graph memory management<graph-memory-management>`.
        stream (torch.cuda.Stream, optional): If supplied, will be set as the current stream in the context.
            If not supplied, ``graph`` sets its own internal side stream as the current stream in the context.
        capture_error_mode (str, optional): specifies the cudaStreamCaptureMode for the graph capture stream.
            Can be "global", "thread_local" or "relaxed". During cuda graph capture, some actions, such as cudaMalloc,
            may be unsafe. "global" will error on actions in other threads, "thread_local" will only error for
            actions in the current thread, and "relaxed" will not error on actions. Do NOT change this setting
            unless you're familiar with `cudaStreamCaptureMode <https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85>`_

    .. note::
        For effective memory sharing, if you pass a ``pool`` used by a previous capture and the previous capture
        used an explicit ``stream`` argument, you should pass the same ``stream`` argument to this capture.

    .. warning::
        This API is in beta and may change in future releases.

    .. _cudaStreamCaptureMode:
        https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
    Nztorch.cuda.StreamÚdefault_capture_streamr   )r   c                 C   sr   | j jd u rtj ¡ | j _|d u r&dn|f| _|d ur:|n| j j| _| jd usRJ ‚tj | j¡| _|| _	|| _
d S )Nr   )r   r%   ÚtorchÚcudaÚStreamr   Zcapture_streamÚstreamÚ
stream_ctxÚ
cuda_graphr   )r   r+   r   r)   r   r   r   r   Ú__init__¹   s    
ÿzgraph.__init__c                 C   s@   t j ¡  t ¡  t j ¡  | j ¡  | jj	| j
d| jiŽ d S )Nr   )r&   r'   ÚsynchronizeÚgcZcollectZempty_cacher*   Ú	__enter__r+   r   r   r   r   r   r   r   r/   Ï   s    


ÿÿzgraph.__enter__c                 C   s   | j  ¡  | j |||¡ d S r   )r+   r   r*   Ú__exit__)r   Úexc_typeÚ	exc_valueÚ	tracebackr   r   r   r0   Ý   s    
zgraph.__exit__)NNr   )r   r    r!   r"   r%   ÚtypingÚOptionalÚ__annotations__Ústrr,   r/   r0   r   r   r   r   r$   ™   s   
   ûûr$   é   Fc           '         sÆ  t  ¡ rt  ¡ rtdƒ‚d}t| tƒs6d}| f} |f}g ‰ t| |ƒD ]œ\}}t|t jjƒrªt	|j
ƒdkr„t	|jƒdkr„t	|jƒdksŒJ dƒ‚tdd„ | ¡ D ƒƒsªJ dƒ‚t jjj|Ž }ˆ  t|ƒ¡ td	d„ |D ƒƒsDJ d
ƒ‚qDdd„ ˆ D ƒ}	dd„ | D ƒ‰‡ ‡fdd„tt	| ƒƒD ƒ}
dd„ tt	| ƒƒD ƒ}dd„ tt	| ƒƒD ƒ}|du rVtƒ n|}t j ¡  t j t j ¡ ¡¼ t| ||
ƒD ]ž\}}}d\}}}t|ƒD ]j}t jj ||Ž ¡}tdd„ |D ƒƒ}t	|ƒdkr t jj|tdd„ |D ƒƒtdd„ |D ƒƒd|d}q |||fD ]}~qq„W d  ƒ n1 s:0    Y  t j ¡  g }g }t| ||ƒD ]p\}}}t jj||d ||Ž }W d  ƒ n1 sœ0    Y  t jj |¡\}}| t|ƒ¡ | |¡ qbg }g }tt|
ƒt|ƒt|ƒƒD ]
\}}}tdd„ |D ƒƒ}tdd„ |D ƒƒ}d}t	|ƒdkršt jj||dB t jj|tdd„ |D ƒƒtdd„ |D ƒƒd|d}W d  ƒ n1 s0    Y  g }d} |D ]:}!|!jrÔ|durÔ| ||  ¡ | d7 } n
| d¡ q¦t|ƒ}| |¡ | |¡ qô|  ¡  |  ¡  dd„ }"g }#t!| ƒD ]ˆ\}$}|"||$ ||$ ˆ|$ |	|$ ||$ |
|$ ||$ ||$ ||$ ƒ	}%t|t jjƒr¢dd „ }&|&||j"|%|j#ƒ|_#|# |¡ n
|# |%¡ q&|r¾|#d S t|#ƒS )!aØ  Accept callables (functions or :class:`nn.Module<torch.nn.Module>`\ s) and returns graphed versions.

    Each graphed callable's forward pass runs its source callable's
    forward CUDA work as a CUDA graph inside a single autograd node.

    The graphed callable's forward pass also appends
    a backward node to the autograd graph. During backward, this node runs the
    callable's backward work as a CUDA graph.

    Therefore, each graphed callable should be a drop-in replacement for its source callable
    in an autograd-enabled training loop.

    See :ref:`Partial-network capture<partial-network-capture>` for detailed use and constraints.

    If you pass a tuple of several callables, their captures will use the same memory pool.
    See :ref:`Graph memory management<graph-memory-management>` for when this is appropriate.

    Arguments:
        callables (torch.nn.Module or Python function, or tuple of these): Callable or callables to graph.
            See :ref:`Graph memory management<graph-memory-management>` for when passing a tuple of callables
            is appropriate.  If you pass a tuple of callables, their order in the tuple must be the same order
            they'll run in the live workload.
        sample_args (tuple of Tensors, or tuple of tuples of Tensors): Samples args for each callable.
            If a single callable was passed, ``sample_args`` must be a single tuple of argument Tensors.
            If a tuple of callables was passed, ``sample_args`` must be tuple of tuples of argument Tensors.
        num_warmup_iters (int): The number of warmup iterations. Currently, ``DataDistributedParallel`` needs
            11 iterations for warm up. Default: ``3``.
        allow_unused_input (bool): If False, specifying inputs that were not used when computing outputs
            (and therefore their grad is always zero) is an error. Defaults to False.
        pool (optional): Token (returned by :func:`~torch.cuda.graph_pool_handle` or
            :meth:`other_Graph_instance.pool()<torch.cuda.CUDAGraph.pool>`) that hints this graph may share memory
            with the indicated pool.  See :ref:`Graph memory management<graph-memory-management>`.
    .. note::
        The ``requires_grad`` state of each Tensor in ``sample_args`` must match the state
        that's expected for the corresponding real input in the training loop.

    .. warning::
        This API is in beta and may change in future releases.

    .. warning::
        ``sample_args`` for each callable must contain only Tensors. Other types are not allowed.

    .. warning::
        Returned callables do not support higher order differentiation (e.g., double backward).

    .. warning::
        In any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters
        may be trainable. Buffers must have ``requires_grad=False``.

    .. warning::
        After you pass a :class:`torch.nn.Module` through :func:`~make_graphed_callables`,
        you may not add or remove any of that Module's parameters or buffers.

    .. warning::
        :class:`torch.nn.Module`\s passed to :func:`~torch.cuda.make_graphed_callables` must not have module hooks
        registered on them at the time they are passed. However, registering hooks on modules *after* passing them
        through :func:`~torch.cuda.make_graphed_callables` is allowed.

    .. warning::
        When running a graphed callable, you must pass its arguments in the same order and format
        they appeared in that callable's ``sample_args``.

    .. warning::
        The automatic mixed precision is supported in :func:`~torch.cuda.make_graphed_callables` only with disabled
        caching. The context manager `torch.cuda.amp.autocast()` must have `cache_enabled=False`.
    z_make_graphed_callables does not support the autocast caching. Please set `cache_enabled=False`.FTr   z§Modules must not have hooks registered at the time they are passed. However, registering hooks on modules after passing them through make_graphed_callables is allowed.c                 s   s   | ]}|j d u V  qdS )FN©Úrequires_grad©Ú.0Úbr   r   r   Ú	<genexpr>@  ó    z)make_graphed_callables.<locals>.<genexpr>zœIn any :class:`~torch.nn.Module` passed to :func:`~make_graphed_callables`, only parameters may be trainable. All buffers must have ``requires_grad=False``.c                 s   s   | ]}t |tjƒV  qd S r   )Ú
isinstancer&   ZTensor)r<   Úargr   r   r   r>   G  r?   zfIn the beta API, sample_args for each callable must contain only Tensors. Other types are not allowed.c                 S   s   g | ]}t |ƒ‘qS r   )Úlen)r<   Úargsr   r   r   Ú
<listcomp>N  r?   z*make_graphed_callables.<locals>.<listcomp>c                 S   s*   g | ]"}t |tjjƒr"t| ¡ ƒnd ‘qS )r   )r@   r&   ÚnnÚModuleÚtupleÚ
parameters)r<   Úcr   r   r   rD   O  s   ÿc                    s   g | ]}ˆ | ˆ|  ‘qS r   r   ©r<   Úi©Zflatten_sample_argsZper_callable_module_paramsr   r   rD   S  s   ÿc                 S   s   g | ]}t j ¡ ‘qS r   ©r&   r'   r   ©r<   Ú_r   r   r   rD   X  r?   c                 S   s   g | ]}t j ¡ ‘qS r   rM   rN   r   r   r   rD   Y  r?   N)NNNc                 s   s   | ]}|j r|V  qd S r   r9   ©r<   Úor   r   r   r>   h  r?   c                 s   s   | ]}|j r|V  qd S r   r9   rJ   r   r   r   r>   l  s   c                 s   s   | ]}|j rt |¡V  qd S r   ©r:   r&   Z
empty_likerP   r   r   r   r>   o  s   )ÚoutputsÚinputsZgrad_outputsZonly_inputsZallow_unused)r   c                 s   s"   | ]}|j rt |¡nd V  qd S r   rR   rP   r   r   r   r>   “  s   c                 s   s   | ]}|j r|V  qd S r   r9   rP   r   r   r   r>   —  r?   c                 s   s   | ]}|j r|V  qd S r   r9   rJ   r   r   r   r>     r?   c                 s   s   | ]}|d ur|V  qd S r   r   rP   r   r   r   r>   ž  r?   é   c	           
         s8   G ‡‡‡‡‡‡‡	fdd„dt jjƒ‰ ‡ ‡‡fdd„}	|	S )Nc                       s@   e Zd Ze‡‡‡‡fdd„ƒZeejjj‡ ‡‡fdd„ƒƒZ	dS )zOmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphedc                    s`   t ˆƒD ].}ˆ|  ¡ ||  ¡ krˆ|  || ¡ qˆ  ¡  tˆtƒsNJ ‚tdd„ ˆD ƒƒS )Nc                 s   s   | ]}|  ¡ V  qd S r   ©ÚdetachrP   r   r   r   r>   Ì  r?   zjmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forward.<locals>.<genexpr>)ÚrangeÚdata_ptrÚcopy_r   r@   rG   )ÚctxrT   rK   )Ú	fwd_graphÚlen_user_argsÚstatic_input_surfaceÚstatic_outputsr   r   ÚforwardÄ  s    zWmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.forwardc                    sr   t |ƒt ˆƒksJ ‚tˆ|ƒD ]*\}}|d ur| ¡ | ¡ kr| |¡ qˆ  ¡  tˆtƒs`J ‚tdd„ ˆD ƒƒS )Nc                 s   s"   | ]}|d ur|  ¡ n|V  qd S r   rV   r;   r   r   r   r>   Ü  s   zkmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backward.<locals>.<genexpr>)rB   ÚziprY   rZ   r   r@   rG   )r[   ZgradsÚgÚgrad)Ú	bwd_graphÚstatic_grad_inputsÚstatic_grad_outputsr   r   ÚbackwardÎ  s    ÿzXmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.Graphed.backwardN)
r   r    r!   Ústaticmethodr`   r&   ÚautogradÚfunctionZonce_differentiablerg   r   )rd   r\   r]   re   rf   r^   r_   r   r   ÚGraphedÃ  s
   	rk   c                     s0   t jjj| Ž }ˆ jt|ƒˆ Ž }t jj |ˆ¡S r   )r&   ÚutilsÚ_pytreeÚarg_tree_leavesÚapplyrG   Ztree_unflatten)Ú	user_argsZflatten_user_argsÚout)rk   Úmodule_paramsÚoutput_unflatten_specr   r   Úfunctionalizedà  s    zVmake_graphed_callables.<locals>.make_graphed_autograd_function.<locals>.functionalized)r&   ri   ZFunction)
r\   rd   rr   r]   rs   r^   r_   rf   re   rt   r   )
rk   rd   r\   r]   rr   rs   re   rf   r^   r_   r   Úmake_graphed_autograd_function¸  s    $z>make_graphed_callables.<locals>.make_graphed_autograd_functionc                    s   ‡ ‡‡‡fdd„}|S )Nc                     s   ˆ j ˆkrˆ| Ž S ˆ| Ž S d S r   )Útraining)rp   ©ÚfuncÚgraph_training_stateÚgraphedÚorig_fwdr   r   Únew_fwdü  s    
zEmake_graphed_callables.<locals>.make_graphed_forward.<locals>.new_fwdr   )rx   ry   rz   r{   r|   r   rw   r   Úmake_graphed_forwardû  s    z4make_graphed_callables.<locals>.make_graphed_forward)$r&   Zis_autocast_enabledZis_autocast_cache_enabledÚRuntimeErrorr@   rG   ra   rE   rF   rB   Z_backward_hooksZ_forward_hooksZ_forward_pre_hooksÚallÚbuffersrl   rm   rn   ÚappendrX   r
   r'   r-   r)   r(   Ztree_leavesri   rc   r$   Ztree_flattenÚreversedr:   ÚreverseÚ	enumeraterv   r`   )'Z	callablesZsample_argsZnum_warmup_itersZallow_unused_inputr   Zjust_one_callablerI   rC   Zflatten_argZper_callable_len_user_argsZ"per_callable_static_input_surfacesZ
fwd_graphsZ
bwd_graphsZmempoolrx   r^   Zgrad_inputsrS   Zoutputs_gradrO   ÚvZper_callable_static_outputsZ"per_callable_output_unflatten_specr\   Zflatten_outputsÚspecZ per_callable_static_grad_outputsZper_callable_static_grad_inputsr_   rd   rf   re   Zgrad_idxrA   ru   ÚretrK   rz   r}   r   rL   r   Úmake_graphed_callablesã   sú    Eÿ
ÿþýûÿÿþ
þ
ÿ
ÿÿ÷
*
(ýÿû&

3÷rˆ   )r8   FN)r.   r4   r&   Ú_utilsr   ÚhasattrZ_CÚ__dict__Ztorch._Cr   r   r   r	   r
   r   r$   rˆ   r   r   r   r   Ú<module>   s    ÿ	kK ÿ