ó
    [T–h¬  ã                   ó’   • % S SK r S SKJr  S SKrS SKJr  / r\\   \S'    " S S5      r	 " S S5      r
S	\S
\4S jrS	\S\S
\4S jrg)é    N)ÚCallable)ÚTensorÚ__all__c                   ó"   • \ rS rSrS\4S jrSrg)Ú_CodeParseré   Úcode_stringc                 ó<  • SnSnSnSnSnSnSnUS-   U-   U-   U-   U-   U-   U-   U-   U-   U-   U-   U-   n	[         R                  " X‘[         R                  5      n
U
c  [        S	U 35      eU
S
   U l        U
S   U l        U
S   U l        U
S   U l        U
S   U l        g )Nz\s*z\s+z(?P<template_params>\<.+\>)z(?P<return_type>\w+)z(?P<function_name>\w+)z(?P<function_params>\(.+\))z(?P<function_body>\{.+\})Útemplatez0Couldn't parse code, please check correctness:
 Útemplate_paramsÚreturn_typeÚfunction_nameÚfunction_paramsÚfunction_body)	ÚreÚmatchÚDOTALLÚ	Exceptionr   r   r   r   r   )Úselfr	   Úoptional_wsÚrequired_wsr   r   r   r   r   ÚpatternÚresults              ÚL/var/www/auris/envauris/lib/python3.13/site-packages/torch/cuda/jiterator.pyÚ__init__Ú_CodeParser.__init__   s  € ØˆØˆØ8ˆØ-ˆØ1ˆØ8ˆØ4ˆð Øñàñð ñð ñ	ð
 ñð ñð ñð ñð ñ	ð ñ
ð ñð ñð 	ô  —’Ø¤"§)¡)ó
ˆð ‰>ÜØCÀKÀ=ÐQóð ð  &Ð&7Ñ8ˆÔØ! -Ñ0ˆÔØ# OÑ4ˆÔØ%Ð&7Ñ8ˆÔØ# OÑ4ˆÕó    )r   r   r   r   r   N)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ústrr   Ú__static_attributes__© r   r   r   r      s   † ð&5 C÷ &5r   r   c                   ó8   • \ rS rSrS\S\S\4S jrS\4S jr	Sr
g	)
Ú_JittedFunctioné6   r	   Úreturn_by_refÚnum_outputsc                 óÚ   • Xl         U(       d  US:X  d   S5       eX l        X0l        [        U5      nUR                  U l        X@l        [        R                  R                  5       U l
        g )Né   z.Return by value only works for single output. )r	   r(   r)   r   r   Úkernel_nameÚkwargs_dictÚtorchÚcudaÚis_availableÚis_cuda_available)r   r	   r(   r)   ÚkwargsÚparsed_codes         r   r   Ú_JittedFunction.__init__7   se   € ð 'Ôö ˜[¨AÓ-ð	<à;ó	<Ø-à*ÔØ&Ôä! +Ó.ˆØ&×4Ñ4ˆÔà!ÔÜ!&§¡×!8Ñ!8Ó!:ˆÕr   Útensorsc                 óž  • U R                   (       d   S5       e[        U5      S::  d   S5       eU R                  R                  5       nUR	                  5        H&  u  pEX@R                  ;   a  XSU'   M  [        U S35      e   [        R                  R                  U R                  U R                  U R                  U R                  UU5      $ )NzFJiterator is only supported on CUDA and ROCm GPUs, none are available.é   z.jiterator only supports up to 8 tensor inputs.z' is not declared in function definition)r1   Úlenr-   ÚcopyÚitemsÚKeyErrorr.   Ú_CÚ)_cuda_jiterator_compile_and_launch_kernelr	   r,   r(   r)   )r   r5   r2   Úexpanded_kwargsÚkeyÚvalues         r   Ú__call__Ú_JittedFunction.__call__H   sÉ   € ð ×"×"ð	TàSó	TØ"ô 7‹|˜qÓ ÐRÐ"RÓRÐ à×*Ñ*×/Ñ/Ó1ˆØ Ÿ,™,ž.‰JˆCØ×&Ñ&Ó&Ø', Ó$ä # Ð&MÐNÓOÐOñ	 )ô x‰x×AÑAØ×ÑØ×ÑØ×ÑØ×ÑØØó
ð 	
r   )r	   r1   r,   r-   r)   r(   N)r   r   r    r!   r"   ÚboolÚintr   r   rA   r#   r$   r   r   r&   r&   6   s+   † ð;Øð;Ø/3ð;ØBEô;ð"
 ÷ 
r   r&   r	   Úreturnc                 ó    • [        U 4SSS.UD6$ )aY	  
Create a jiterator-generated cuda kernel for an elementwise op.

The code string has to be a valid CUDA function that describes the computation for a single element. The code
string has to follow the c++ template pattern, as shown in the example below. This function will be inlined
into elementwise kernel template, and compiled on the fly. Compiled kernel will be cached in memory, as well as
local temp dir.

Jiterator-generated kernels accepts noncontiguous tensors, and supports broadcasting and type promotion.

Args:
    code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return by value.
    kwargs (Dict, optional): Keyword arguments for generated function

Example::

    code_string = "template <typename T> T my_kernel(T x, T y, T alpha) { return -x + alpha * y; }"
    jitted_fn = create_jit_fn(code_string, alpha=1.0)
    a = torch.rand(3, device='cuda')
    b = torch.rand(3, device='cuda')
    # invoke jitted function like a regular python function
    result = jitted_fn(a, b, alpha=3.14)

code_string also allows multiple function definitions, and the last function will be treated as the entry function.

Example::

    code_string = "template <typename T> T util_fn(T x, T y) { return ::sin(x) + ::cos(y); }"
    code_string += "template <typename T> T my_kernel(T x, T y, T val) { return ::min(val, util_fn(x, y)); }"
    jitted_fn = create_jit_fn(code_string, val=0.0)
    a = torch.rand(3, device='cuda')
    b = torch.rand(3, device='cuda')
    # invoke jitted function like a regular python function
    result = jitted_fn(a, b)  # using default val=0.0

Jiterator can be used together with python registration to override an operator's cuda kernel.
Following example is overriding gelu's cuda kernel with relu.

Example::

    code_string = "template <typename T> T my_gelu(T a) { return a > 0 ? a : 0; }"
    my_gelu = create_jit_fn(code_string)
    my_lib = torch.library.Library("aten", "IMPL")
    my_lib.impl('aten::gelu', my_gelu, "CUDA")
    # torch.nn.GELU and torch.nn.function.gelu are now overridden
    a = torch.rand(3, device='cuda')
    torch.allclose(torch.nn.functional.gelu(a), torch.nn.functional.relu(a))

.. warning::
    This API is in beta and may change in future releases.

.. warning::
    This API only supports up to 8 inputs and 1 output

.. warning::
    All input tensors must live in CUDA device
Fr+   ©r(   r)   ©r&   )r	   r2   s     r   Ú_create_jit_fnrI   b   s   € ôt ˜;ÐU°eÈÑUÈfÑUÐUr   r)   c                 ó    • [        U 4SUS.UD6$ )a\  
Create a jiterator-generated cuda kernel for an elementwise op that supports returning one or more outputs.

Args:
    code_string (str): CUDA code string to be compiled by jiterator. The entry functor must return value by reference.
    num_outputs(int): number of outputs return by the kernel
    kwargs (Dict, optional): Keyword arguments for generated function

Example::

    code_string = "template <typename T> void my_kernel(T x, T y, T alpha, T& out) { out = -x + alpha * y; }"
    jitted_fn = create_jit_fn(code_string, alpha=1.0)
    a = torch.rand(3, device='cuda')
    b = torch.rand(3, device='cuda')
    # invoke jitted function like a regular python function
    result = jitted_fn(a, b, alpha=3.14)

.. warning::
    This API is in beta and may change in future releases.

.. warning::
    This API only supports up to 8 inputs and 8 outputs
TrG   rH   )r	   r)   r2   s      r   Ú_create_multi_output_jit_fnrK   Ÿ   s&   € ô4 ØðØ#'°[ñØDJñð r   )r   Útypingr   r.   r   r   Úlistr"   Ú__annotations__r   r&   rI   rD   rK   r$   r   r   Ú<module>rO      sk   ðä 	Ý ã Ý ð €ˆˆc‰Ó ÷'5ñ '5÷T)
ñ )
ðX:V ð :V°(ô :VðzØðØ#&ðàõr   