o
    Zh#                    @  s:  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
Z
d dlZd dlZd dlmZ d dlmZmZmZmZmZ d dlZd dlmZ d dlZd dlZd dlm  mZ d dlmZ d dlm Z m!Z! d dl"m#Z# d d	l$m%Z% d d
l&m'Z' d dl(m)Z)m*Z*m+Z+m,Z,m-Z- d dl.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5m6Z6 ddl7m8Z8m9Z9m:Z: ddl;m<Z< ddl:m=Z=m>Z> ddl?m@Z@ ddlAmBZB ddlmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZNmOZOmPZPmQZQmRZRmSZS ddlTmUZU ddlVmWZWmXZXmYZY erd dlZm[Z[m\Z\ d dl]Z]ddl^m_Z_ eQ j`Zaebejcejedf Zeee:jfeRf Zgdcd"d#Zhddd&d'Zided+d,Zjdfd.d/Zkdgd2d3Zlemedenf Zoeebeenejf d4f eeogebend4f f f Zp	dhdid=d>Zqdjd?d@ZrejsG dAdB dBZtG dCdD dDZuG dEdF dFZvejsG dGdH dHevZwejsG dIdJ dJevZxejsG dKdL dLevZyG dMdN dNevZzejsG dOdP dPevZ{ejsG dQdR dRe{Z|ejsG dSdT dTe{Z}ejsG dUdV dVe{Z~G dWdX dXe{ZejsG dYdZ dZevZejsG d[d\ d\eZejsG d]d^ d^eZedZG d_d` d`eNZG dadb dbeZdS )k    )annotationsN)count)AnyCallableOptionalTYPE_CHECKINGUnion)Expr)dtype)countersdynamo_timed)DebugPrinterManager)MultiKernelState)	cache_dir)CallMethodKeyConvertIntKeyDivideByKeyresolve_unbacked_bindingsSymTypes)_get_qualified_name)
OrderedSet)SingletonInt)symbol_is_typeSymT   )async_compileconfigir)output_code_log)IRNodeReinterpretView)triton_heuristics)DeviceProperties)cache_on_selfget_benchmark_nameLineContextsympy_product	sympy_str
sympy_substriton_version_uses_attrs_dict)V   )ArgNameCodeGenDeferredLineIndentedBufferPythonPrinterWorkspaceArgWorkspaceZeroMode)cexpr)	config_ofshould_unwrap_unspec_argsignature_to_meta)IteratorSequence)GraphLoweringnode
BufferLikereturnReuseKeyc                 C  s,   t j| }|  |  tt jj|fS N)r*   graphget_allocation_storage_sizeget_device_or_error	get_dtyper'   sizevarssimplify)r:   Zstorage_size rE   N/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/wrapper.pybuffer_reuse_keyT   s
   rG   	input_buf
output_bufc                 C  s   |   |  kr
dS |  | krdS tjjtj| }tjjtj|}t|t|ksDtjj|d| rFtjj	||rFdS dS )NFgffffff?T)
rA   rB   r*   r?   rC   rD   r@   r'   Zstatically_known_geqZstatically_known_leq)rH   rI   Z
input_sizeZoutput_sizerE   rE   rF   can_match_buffer_size`   s"   

rJ   argtorch.Argumentstrc           
      C  s   ddl m}m} t| j}|dkr&| jd ur | jjr d| dS d| dS ||v r0|| }|S | D ]2\}}t	|d |}t
|dkrf|d }||v sXJ d	| d
| || }	| d|	 d  S q4td| )Nr+   )CONTAINER_PYTHON_TO_CPPPYTHON_TO_CPPTensorzat::&z const&z\[([a-zA-Z_]+)]r   zunsupported z type in convert_arg_type: <>zunsupport python_type: )cpprN   rO   repr	real_type
alias_infoZis_writeitemsrefindalllenAssertionError)
rK   rN   rO   python_typecpp_typeZpy_containerZcpp_containerZcontainer_matchZcontained_typeZcpp_contained_typerE   rE   rF   convert_arg_type   s(   

r_   retc                 C  sT   t | j}ddd}||d }|d usJ d| |dkr(| jd ur(|d7 }|S )Nz
at::Tensorzstd::vector<at::Tensor>)rP   zList[Tensor]zNYI return type: rP   rQ   )rU   rV   getrW   )r`   r]   Zpython_to_cppr^   rE   rE   rF   convert_return_type   s   
rb   kerneltorch._ops.OpOverloadc                 C  s   | j j}| j j}t|}|dksJ d|dkrt|d }n|dkr3ddd |D }d| d}d	d |D }| d
d| dS )Nr   z#must have at least one return valuer+   , c                 S     g | ]}t |qS rE   )rb   ).0rrE   rE   rF   
<listcomp>       z%get_cpp_op_schema.<locals>.<listcomp>zstd::tuple<rS   c                 S  s    g | ]}t | d |j qS ) )r_   namerg   rK   rE   rE   rF   ri      s     ())Z_schema	argumentsreturnsr[   rb   join)rc   argsrq   Znum_returnsZcpp_return_valueZtuple_returnsZcpp_arg_typerE   rE   rF   get_cpp_op_schema   s   rt   .rl   configslist[triton.Config]gridslist[TritonGrid]wrapperOptional[PythonWrapperCodegen]tuple[str, str]c              	     s  t  d!dd d" fd	d
}d#d$fdd}d }|d| d r2tjjr2j nt }  | t|dkrX||d \}}	|d| d|	  n`t|dks`J t|t|ksjJ t	t
  }
tt||dd ddD ]<\}}|jrdd |j D }d|}nd}||\}}	d| d | }||
v rq{|
| ||d| d |	  q{W d    n1 sw   Y  W d    n1 sw   Y  | fS )%NitemUnion[int, sympy.Expr]r<   
sympy.Exprc                 S  s   t | tjr| S t| S r>   )
isinstancesympyr	   Integer)r|   rE   rE   rF   _convert_to_sympy_expr      z@user_defined_kernel_grid_fn_code.<locals>._convert_to_sympy_exprgrid
TritonGridc                   sb   du st | r| | fS t fdd| D }|tjjr.tfdd|D fS dfS )a'  
        This function return a tuple of two values: the first one is for the real grid
        which is used in the generated code; the second one is an example grid with
        concreate values which is used in the autotune block to run the generated
        kernels at compile time.
        Nc                 3  s    | ]} |V  qd S r>   rE   rg   g)r   rE   rF   	<genexpr>       zKuser_defined_kernel_grid_fn_code.<locals>.determine_grid.<locals>.<genexpr>c                 3       | ]}  |t|V  qd S r>   generate_example_arg_valuetyper   )ry   rE   rF   r      s
    
)callabletuplecodegen_python_shape_tupler   tritonautotune_at_compile_time)r   Z
sympy_grid)r   ry   rE   rF   determine_grid   s   	
z8user_defined_kernel_grid_fn_code.<locals>.determine_gridlinerM   example_gridOptional[str]c                   s@    |  rtjjr jvrj |p|  d S d S d S d S r>   )	writeliner   r   r   kernel_autotune_nameskernel_autotune_calls)r   r   )rl   outputry   rE   rF   r      s   

z3user_defined_kernel_grid_fn_code.<locals>.writelineZgrid_wrapper_for_def z(meta):r+   r   zreturn c                 S     t | d jS Nr+   r[   kwargsxrE   rE   rF   <lambda>      z2user_defined_kernel_grid_fn_code.<locals>.<lambda>Tkeyreversec                 S  s    g | ]\}}d | d| qS )zmeta['z'] == rE   )rg   rl   valrE   rE   rF   ri     s    z4user_defined_kernel_grid_fn_code.<locals>.<listcomp>z and Trueif z	: return )r|   r}   r<   r~   )r   r   r>   )r   rM   r   r   )r/   r   r   r   r   indent
contextlibnullcontextr[   r   rM   sortedzipr   rX   rr   addgetvalue)rl   ru   rw   ry   r   r   fn_nameZkernel_autotune_calls_indentr   r   seencZguardsZ	statementrE   )r   rl   r   ry   rF    user_defined_kernel_grid_fn_code   sN   

	

 r   c                   s^   t  j| jdd ddlm  ddlm t| jg fdd|  	 S )zg
    Given a triton kernel function pointer collect the transitive closure of
    its dependencies
    Tstripr   )JITFunction)	constexprc              	     s  t dd t| jD }| jjdi }| jjjD ]}|v r!q|| jjv r| jj| }t| rM	  
d j|jdd | | qt|tttfr	  t|rgd|jd}n|}|| }rt|trd	|j d
|j }nd	|}
| | d|  n

| d|  | q||v r|dkrt|dr|jdrˈ
d|j d|j d|  | qd S )Nc                 s  s     | ]}|j d kr|jV  qdS )LOAD_GLOBALN)opnameargval)rg   instrE   rE   rF   r   4  s    
z^user_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse.<locals>.<genexpr>__annotations__z@triton.jitTr   ztl.constexpr(ro   : . = tl
__module__r   zfrom z import z as )r   disBytecodefn__globals__ra   __code__co_namesr   newliner   splicesrcr   intrM   boolvaluer   r   __name__hasattr
startswith)Z
cur_kernelZunqualified_loadsZglobal_annotationsZsymbol_namesymbolZ
symbol_str
annotationZannotation_coder   compile_wrapperr   Zsymbols_includedtraverserE   rF   r   /  sT   










zKuser_defined_triton_kernel_transitive_closure_source_code.<locals>.traverse)
r/   r   r   r   r   Ztriton.languager   r   r   r   )rc   rE   r   rF   9user_defined_triton_kernel_transitive_closure_source_code   s   :r   c                   @  s&   e Zd ZU ded< ded< dd ZdS )SymbolicCallArgrM   innerr~   
inner_exprc                 C  s
   t | jS r>   )rM   r   selfrE   rE   rF   __str__s     
zSymbolicCallArg.__str__N)r   r   __qualname__r   r   rE   rE   rE   rF   r   m  s   
 r   c                      s:   e Zd Z fddZdddZdd
dZdddZ  ZS )MemoryPlanningStatec                   s    t    tt| _d| _d S Nr   )super__init__collectionsdefaultdictlist
reuse_pooltotal_allocated_buffer_sizer   	__class__rE   rF   r   x  s   

zMemoryPlanningState.__init__r   r=   r<   r   c                 C  s   t | j|d S r>   )r   r   ra   )r   r   rE   rE   rF   __contains__     z MemoryPlanningState.__contains__FreeIfNotReusedLinec                 C  s   | j |  }|jrJ |S r>   )r   pop	is_reusedr   r   r|   rE   rE   rF   r     s   
zMemoryPlanningState.popr|   Nonec                 C  s   |j rJ | j| | d S r>   )r   r   appendr   rE   rE   rF   push  s   
zMemoryPlanningState.push)r   r=   r<   r   )r   r=   r<   r   )r   r=   r|   r   r<   r   )r   r   r   r   r   r   r   __classcell__rE   rE   r   rF   r   w  s
    

r   c                   @     e Zd ZdS )WrapperLineNr   r   r   rE   rE   rE   rF   r         r   c                   @  s2   e Zd ZU ded< ded< dddZdddZdS )EnterSubgraphLinePythonWrapperCodegenry   r9   r?   r<   r   c                 C  s   | j | j j d S r>   )ry   push_computed_sizescomputed_sizesr   rE   rE   rF   __post_init__     zEnterSubgraphLine.__post_init__coder/   c                 C  s   | j | j |  d S r>   )ry   push_codegened_graphr?   	do_indentr   r   rE   rE   rF   codegen  s   zEnterSubgraphLine.codegenNr<   r   r   r/   r<   r   r   r   r   r   r   r  rE   rE   rE   rF   r     s
   
 
r   c                   @  s*   e Zd ZU ded< dddZdd	d
ZdS )ExitSubgraphLiner   ry   r<   r   c                 C  s   | j  | j _d S r>   )ry   pop_computed_sizesr   r   rE   rE   rF   r     r   zExitSubgraphLine.__post_init__r   r/   c                 C  s   | j   |  d S r>   )ry   pop_codegened_graphdo_unindentr  rE   rE   rF   r    s   
zExitSubgraphLine.codegenNr  r  r  rE   rE   rE   rF   r    s   
 
r  c                   @  s(   e Zd ZU ded< ded< dd	d
ZdS )EnterDeviceContextManagerLiner   
device_idxzOptional[int]last_seen_device_guard_indexr   r/   r<   r   c                 C  s   t jjrO|d t jjr,| jd u r |t jj  d d S | j| jks*J dd S | jd u rC|t jj	  d| j d d S |d| j d d S |dt jj
| j d |  |t jj| j d S )	N
z) stream_guard(stream, this->device_idx_);z4AOTInductor only supports running on one CUDA devicez device_guard(z);zdevice_guard.set_index(with :)r*   r?   cpp_wrapperr   aot_moder  
device_opsZcpp_aoti_stream_guardr  Zcpp_aoti_device_guarddevice_guardr  
set_devicer  rE   rE   rF   r    s$   


z%EnterDeviceContextManagerLine.codegenNr  )r   r   r   r   r  rE   rE   rE   rF   r    s   
 r  c                   @     e Zd ZdddZdS )	ExitDeviceContextManagerLiner   r/   r<   r   c                 C  s   t jjs
|  d S d S r>   )r*   r?   r  r
  r  rE   rE   rF   r       z$ExitDeviceContextManagerLine.codegenNr  r   r   r   r  rE   rE   rE   rF   r    s    r  c                   @  s4   e Zd ZU ded< dddZdddZdddZdS )MemoryPlanningLiner   ry   stater   r<   c                 C  s   | S )zFirst pass to find reuserE   r   r  rE   rE   rF   plan  s   zMemoryPlanningLine.planr   r/   r   c                 C     dS )zSecond pass to output codeNrE   r  rE   rE   rF   r    s    zMemoryPlanningLine.codegenrM   c                 C  sr   g }t | D ]#}|jdkrqt| |j}||j d|jtju r%| n|  qt| j	 dd
| dS )zF
        Emits a string representation that fits on one line.
        ry   =rn   re   ro   )dataclassesfieldsrl   getattrr   r   r   Bufferget_namer   rr   )r   rs   fieldr   rE   rE   rF   r     s   
"zMemoryPlanningLine.__str__Nr  r   r<   r  r  r<   rM   )r   r   r   r   r  r  r   rE   rE   rE   rF   r    s
   
 

r  c                   @  s*   e Zd ZU ded< dddZdddZdS )AllocateLiner;   r:   r  r   r<   r  c                 C  s   | j  tjjv rt| jS t| j }tj	r+||v r+|
|}d|_t| j|j | j S | j  jdkrM| j| j }|d urM| jtttj|d7  _| S )NTcpur+   )r:   r$  r*   r?   removed_buffersNullLinery   rG   r   allow_buffer_reuser   r   	ReuseLinerA   r   static_shape_for_buffer_or_noner   r   	functoolsreduceoperatormul)r   r  r   Z	free_lineZstatic_shaperE   rE   rF   r    s   


zAllocateLine.planr   r/   r   c                 C  s2   | j  tjjvsJ | j| j }|| d S r>   )r:   r$  r*   r?   r*  ry   make_buffer_allocationr   r   r   r   rE   rE   rF   r    s   zAllocateLine.codegenNr&  r  )r   r   r   r   r  r  rE   rE   rE   rF   r(    s   
 
r(  c                   @  s6   e Zd ZU ded< dZded< dd
dZdddZdS )r   r;   r:   Fr   r   r  r   r<   r  c                 C  sl   t | j dkr| S t| jjtjr| S | jrJ | j t	j
jv r(t| jS tjr4|t| j|  | S r   )r[   r:   Zget_inputs_that_alias_outputr   layoutr   ZMultiOutputLayoutr   r$  r*   r?   r*  r+  ry   r   r,  r   rG   r  rE   rE   rF   r    s   

zFreeIfNotReusedLine.planr   r/   r   c                 C  s8   | j  tjjvsJ | js|| j| j  d S d S r>   )	r:   r$  r*   r?   r*  r   r   ry   make_buffer_freer  rE   rE   rF   r    s   zFreeIfNotReusedLine.codegenNr&  r  )r   r   r   r   r   r  r  rE   rE   rE   rF   r   
  s
   
 
r   c                   @  s>   e Zd ZU ded< ded< dZded< dddZdddZdS )r-  r;   r:   	reused_asTr   
delete_oldr  r   r<   r  c                 C  sL   | j  tjjv r| j tjjv sJ t| jS | j tjjvs$J | S r>   )r:   r$  r*   r?   r*  r7  r+  ry   r  rE   rE   rF   r  '  s
   
zReuseLine.planr   r/   r   c                 C  sL   | j  tjjvsJ | j tjjvsJ || j| j | j| j	 d S r>   )
r:   r$  r*   r?   r*  r7  r   ry   make_buffer_reuser8  r  rE   rE   rF   r  .  s
   zReuseLine.codegenNr&  r  )r   r   r   r   r8  r  r  rE   rE   rE   rF   r-  !  s   
 
r-  c                   @  r   )r+  Nr   rE   rE   rE   rF   r+  6  r   r+  c                   @  sH   e Zd ZU ded< ded< edddZedd
dZedddZdS )CommBufferLiner   ry   	ir.Bufferr:   r<   r   c                 C  sF   ddl m} | j }| j }||rtd| j t||j S )Nr   )is_symbolicz-The size of a comm buffer can't be symbolic: )Ztorch._inductor.utilsr<  r:   Z	get_numelrB   r\   r   itemsize)r   r<  numelr
   rE   rE   rF   size?  s   


zCommBufferLine.sizeir.CommBufferTypec                 C      | j  }t|tjsJ |jS r>   )r:   get_output_specr   r   CommBufferLayoutcomm_buffer_typer   r5  rE   rE   rF   rD  K     
zCommBufferLine.comm_buffer_typerM   c                 C  rA  r>   )r:   rB  r   r   rC  
group_namerE  rE   rE   rF   rG  Q  rF  zCommBufferLine.group_nameNr<   r   )r<   r@  r'  )r   r   r   r   propertyr?  rD  rG  rE   rE   rE   rF   r:  :  s   
 r:  c                   @  s"   e Zd Zd
ddZedd Zd	S )CommBufferAllocateLiner   r/   r<   r   c                 C  sx   | j  tjjvsJ | j  }| j  }| j  }t| j  }t| j 	 }|
| | j| j| j||||| d S r>   )r:   r$  r*   r?   r*  
get_devicerB   r   get_size
get_strider   make_allocation_linerD  rG  ry   )r   r   rl   devicer
   shapestriderE   rE   rF   r  Z  s$   


zCommBufferAllocateLine.codegenc                 C  s^   | t jjkr(| d|| d|| d| d|j d| dtdd dS td	|  )
Nz = empty_strided_p2p(re   z, torch.device("cuda:z"), group_name="z", alloc_id=r   l    ro   zUnsupported comm buffer type: )r   ZCommBufferTypeZSYMM_MEMcodegen_shape_tupleindexrandomrandintNotImplementedError)rD  rG  ry   rl   rO  r
   rP  rQ  rE   rE   rF   rN  n  s$   

z+CommBufferAllocateLine.make_allocation_lineNr  )r   r   r   r  staticmethodrN  rE   rE   rE   rF   rJ  X  s    
rJ  c                   @  r  )	CommBufferFreeLiner   r/   r<   r   c                 C  s,   | j | j}|| d| jj d d S )Nz # z buffer free)ry   r6  r:   r   rD  r   r4  rE   rE   rF   r    s   zCommBufferFreeLine.codegenNr  r  rE   rE   rE   rF   rX    s    rX  c                      s  e Zd ZdZ fddZe	dFdGddZdHddZdIddZdHddZ	dJddZ
dHddZedHdd ZedHd!d"ZdKd%d&ZedLd(d)ZdHd*d+ZdMd-d.ZdNd0d1ZdHd2d3ZdHd4d5ZdHd6d7ZdOd9d:ZdPd<d=ZdLd>d?ZdHd@dAZdHdBdCZdFdQdEdFZdGdH ZdIdJ ZdKdL ZdMdN Z dOdP Z!dRdQdRZ"dSdSdTZ#dHdUdVZ$dTdXdYZ%dUd\d]Z&dUd^d_Z'dUd`daZ(dbdc Z)ddde Z*dVdkdlZ+dWdndoZ,dpdq Z-drds Z.dtdu Z/			dXdYd|d}Z0d~d Z1dPddZ2dd Z3dd Z4dd Z5dd Z6dZddZ7dd Z8d[ddZ9dd Z:ddd\ddZ;ddd\ddZ<d]ddZ=d^ddZ>d_ddZ?d_ddZ@dRddZA	dFd`ddZBdaddZCdd ZDdd ZEdd ZFdd ZG			dbdcddĄZHddddǄZIdedd˄ZJdFdfdd΄ZKdgdd҄ZLdgddԄZMddք ZNdd؄ ZOddڄ ZPdd܄ ZQddބ ZRdd ZSdd ZTdd ZUdhddZVdd ZWdddddddiddZXdd ZYdd ZZdd Z[dFddZ\djddZ]	dFddZ^dkddZ_dlddZ`dmddZadnddZbdoddZcdpddZddqddZedd ZfdFddZgdd ZhdrddZidd Zjdsd%d&Zkd'd( Zld)d* Zmdtd-d.Zndud0d1Zod2d3 Zpd4d5 Zqd6d7 Zrd8d9 Zsd:d; Zted<d= Zued>d? Zved@dA ZwedBdC ZxedDdE Zy  ZzS (v  r   zB
    Generate outer wrapper in Python that calls the kernels.
    c                   s  t    t  _t  _t  _t  _t  _t  _	t  _
t  _t  _t  _tt   _i  _t  _g  _d _d _d _d _d _tjjrRdnd _tjjr[dnd _d  _d _i  _ tt   _!t  _"d  _# $  g  _%g  _& '   (   )  tjj*stjj+, D ]
\}} -|| qtt.   _/tt.   _0i  _1t23d  j4 _4t23d d fd
d}| _5i  _6tt   _7t8  _9tt   _:i  _;t<t=j>j?t=j>j@d _Ag  _Bd S )N #r   z
std::move(ro   Tr   rM   r<   c                   s(    j |  tjjr j|  d S d S r>   )importsr   r   r   r   r   )r   r   rE   rF   add_import_once  s   z6PythonWrapperCodegen.__init__.<locals>.add_import_once)Zdebug_printer_levelZuse_array_ref)r   rM   r<   r   )Cr   r   r   _names_iterr/   r[  headerprefixsuffixkernel_declarationswrapper_callkernel_autotune_defsr   subgraph_definitionsr   rM   r   Zsrc_to_kernelZkernel_numel_exprlinesdeclaredeclare_maybe_referenceendingcommentZnone_strr*   r?   r  Z
move_beginZmove_endr  supports_intermediate_hooksuser_defined_kernel_cacheunbacked_symbol_declsr   launcher_fn_nameset_launcher_fn_namecodegened_graph_stackcomputed_sizes_stackwrite_headerwrite_prefix!write_kernel_autotune_defs_headerr  Zconstant_reprsrX   write_constant
BufferName	allocatedfreedreusesr/  	lru_cachewrite_get_raw_streamr\  _metas
_meta_varsr   Zmulti_kernel_statealready_codegened_subgraphsallocated_workspacesr   r   aot_inductor debug_intermediate_value_printerZallow_stack_allocationdebug_printerZadditional_files)r   rl   hashedr\  r   r   rF   r     sp   

zPythonWrapperCodegen.__init__Nis_subgraphr   subgraph_namer   parent_wrapperrz   partition_signatures$Optional[ir.GraphPartitionSignature]c                 C  s.   | r|d usJ |d usJ t |||S t S r>   )SubgraphPythonWrapperCodegenr   )r  r  r  r  rE   rE   rF   create  s   zPythonWrapperCodegen.creater<   r   c                 C  s
   d| _ d S )Ncall)rm  r   rE   rE   rF   rn    r   z)PythonWrapperCodegen.set_launcher_fn_namerl   rM   r  c                 C  s   | j | d|  d S )Nz = None  # )r^  r   )r   rl   r  rE   rE   rF   rt    r   z#PythonWrapperCodegen.write_constantc              	   C  s   t jj }d}|d ur|jd urd|j }d}ttjjdkr#d}| j	j
d| dtj d| dd	d
 | jj
dd	d
 zddlm} | jj
dd	d
 W n ttfyY   Y nw tjre| jd d S d S )NrY  z
# AOT ID: r   zRfrom torch._inductor.codegen.debug_utils import _print_debugging_tensor_value_infoz
                aH  
                from ctypes import c_void_p, c_long, c_int
                import torch
                import math
                import random
                import os
                import tempfile
                from math import inf, nan
                from cmath import nanj
                from torch._inductor.hooks import run_intermediate_hooks
                from torch._inductor.utils import maybe_profile
                from torch._inductor.codegen.memory_planning import _align as align
                from torch import device, empty_strided
                from z import AsyncCompile
                from torch._inductor.select_algorithm import extern_kernels
                from torch._inductor.codegen.multi_kernel import MultiKernelCall
                z
            Tr   a  
                aten = torch.ops.aten
                inductor_ops = torch.ops.inductor
                _quantized = torch.ops._quantized
                assert_size_stride = torch._C._dynamo.guards.assert_size_stride
                empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
                reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
                alloc_from_pool = torch.ops.inductor._alloc_from_pool
                async_compile = AsyncCompile()
            )_SymmetricMemoryzs
                empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p
                zfrom torch.cuda import nvtx)torchZ_guardsZTracingContextZtry_getZaot_graph_namer   r   r  r  r[  r   r   r   r^  Ztorch._C._distributed_c10dr  AttributeErrorImportErrorannotate_trainingr   )r   contextZaot_config_commentZaot_inductor_debug_utilsr  rE   rE   rF   rq    sB   
z!PythonWrapperCodegen.write_headerr^  c                 C     d S r>   rE   )r   r^  rE   rE   rF   include_extra_header5     z)PythonWrapperCodegen.include_extra_headerc                 C     | j dtj d d S )Na	  
                import torch
                from torch._dynamo.testing import rand_strided
                from torch._dynamo.utils import preserve_rng_state
                from torch._inductor.select_algorithm import AlgorithmSelectorCache
                from aH   import AsyncCompile

                async_compile = AsyncCompile()
                generate_example_value = AlgorithmSelectorCache.generate_example_value
                empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
                empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
            )rc  r   r   r   r   rE   rE   rF   rs  8  s
   z6PythonWrapperCodegen.write_kernel_autotune_defs_headerc                 C  sn   dt j d}tjjr| j| | jtj	j
d tj	js5| jj|dd | jtj	j
d d S d S )NzU
            import triton
            import triton.language as tl
            from z+ import start_graph, end_graph
            get_raw_streamTr   )r!   r   r   r   r   r   r   r   r*   r?   r  import_get_raw_stream_asr  r[  )r   Z
import_strrE   rE   rF   write_triton_header_onceH  s   z-PythonWrapperCodegen.write_triton_header_oncec                 C  sD   t jjr| jtjjd tjj	s | j
tjjd d S d S )Nr  )r   r   r   r   r   r*   r?   r  r  r  r[  r   rE   rE   rF    write_get_raw_stream_header_onceZ  s   z5PythonWrapperCodegen.write_get_raw_stream_header_oncemetaTritonMetaParamsc                 C  sv   t |}|| jvr6dt| j }|| j|< | j| d|  tjjr6| j| d|  | j	
| | j| S )Nr  r   )rU   r{  r[   r^  r   r   r   r   r   r|  r   )r   r  varrE   rE   rF   add_meta_oncee  s   


z"PythonWrapperCodegen.add_meta_once	list[str]c                   s    fdd   D S )Nc                   s   g | ]}|  jqS rE   )codegen_referencerb  rg   r   r   rE   rF   ri   r  s    z8PythonWrapperCodegen.get_output_refs.<locals>.<listcomp>)get_graph_outputsr   rE   r   rF   get_output_refsp  s   
z$PythonWrapperCodegen.get_output_refsc                 C  r  r>   rE   r   rE   rE   rF   mark_output_typev  r  z%PythonWrapperCodegen.mark_output_type>dict[str, Union[ir.TensorBox, ir.TorchBindObject, sympy.Expr]]c                 C     t jjS r>   )r*   r?   graph_inputsr   rE   rE   rF   get_graph_inputsy  s   z%PythonWrapperCodegen.get_graph_inputslist[IRNode]c                 C  r  r>   )r*   r?   graph_outputsr   rE   rE   rF   r  ~     z&PythonWrapperCodegen.get_graph_outputsc              
   C  s   |    D ]B\}}t|tjtjfrq|tjj	vs t|tj
r!qt| dkr*q| | }| | }| jd| d| d| d qd S )Nr   zassert_size_stride(re   ro   )r  rX   r   r   r	   r   TorchBindObjectr*   r?   graph_input_namesGeneratorStater&   rL  r   rM  r_  r   )r   rl   bufr?  rQ  rE   rE   rF   codegen_input_size_asserts  s   "z/PythonWrapperCodegen.codegen_input_size_assertsc                 C  sj   | j d |   D ]&\}}t|tjtjfrqd| d}| j | d| d}| j | qd S )Nz(# make sure graph inputs are not nan/infzassert not z.isnan().any().item()z.isinf().any().item())	r_  r   r  rX   r   r   r	   r   r  )r   rl   r  r   rE   rE   rF   codegen_input_nan_asserts  s   z.PythonWrapperCodegen.codegen_input_nan_assertsc                 C     | j d d S )NzV

            async_compile.wait(globals())
            del async_compile
            )r_  r   r   rE   rE   rF   write_async_compile_wait  s   z-PythonWrapperCodegen.write_async_compile_waitinput_namesc                 C  s@   d |}t|dkr|d7 }| j| d | jd d S )Nre   r+   ,z = argszargs.clear())rr   r[   r_  r   )r   r  lhsrE   rE   rF   
write_args  s
   
zPythonWrapperCodegen.write_argsr   c                 C  s8   t jr| jd d}|S | jd| j d d}|S )Na  
                class Runner:
                    def __init__(self, partitions):
                        self.partitions = partitions

                    def recursively_apply_fns(self, fns):
                        new_callables = []
                        for fn, c in zip(fns, self.partitions):
                            new_callables.append(fn(c))
                        self.partitions = new_callables

                    def call(self, args):
                r   z
                def z(args):
                r+   )r   graph_partitionr_  r   rm  r   prefix_indentrE   rE   rF   !write_launcher_fn_call_get_indent  s   	z6PythonWrapperCodegen.write_launcher_fn_call_get_indentc                 C  r  r>   )r*   r?   r  r   rE   rE   rF   get_graph_input_names  r  z*PythonWrapperCodegen.get_graph_input_namesc                 C  s   | j d usJ |   |  }| j|< tjjr$| jt	j
j  t	j
 }tjr6| jd| d |   }rA| | |   |   W d    d S 1 sTw   Y  d S )Nz0training_annotation = nvtx._device_range_start(''))rm  r  r  r_  r   r   r   debug_sync_graphr   r*   r?   r  synchronizeZget_training_phaser  r  r  codegen_inputs"codegen_input_size_and_nan_asserts)r   r  Zphaser  rE   rE   rF   rr    s    



"z!PythonWrapperCodegen.write_prefixc                 C  s$   t jr|   t jr|   d S d S r>   )r   Zsize_assertsr  Znan_assertsr  r   rE   rE   rF   r    s
   z7PythonWrapperCodegen.codegen_input_size_and_nan_assertsr  c                 C  sX   |    d| }tjjr| j| d| d tjjr|S | | d| d |S )Nstream = get_raw_stream(ro   )	r  r   r   r   r   r   r*   r?   r  )r   r  r?   rl   rE   rE   rF   rz    s   
z)PythonWrapperCodegen.write_get_raw_streamc                 C  s
   | j d S )N)ro  r   rE   rE   rF   get_codegened_graph  r   z(PythonWrapperCodegen.get_codegened_graphc                 C     | j | d S r>   )ro  r   )r   r?   rE   rE   rF   r        z)PythonWrapperCodegen.push_codegened_graphc                 C  
   | j  S r>   )ro  r   r   rE   rE   rF   r	    r   z(PythonWrapperCodegen.pop_codegened_graphc                 C  s   ddl m} | j||S )Nr   )deepcopy)copyr  rp  r   )r   r   r  rE   rE   rF   r      s   z(PythonWrapperCodegen.push_computed_sizesc                 C  r  r>   )rp  r   r   rE   rE   rF   r    r   z'PythonWrapperCodegen.pop_computed_sizesc                 C  s   t | j S r>   )nextr]  r   rE   rE   rF   next_kernel_suffix     z'PythonWrapperCodegen.next_kernel_suffixc                 C  s   |  t|| j tjjr=|   | j dtj	j
| d | j  | j tj	j
| | j d| d| d || _d S )Nr  r  r  r  ro   )r   r  r  r   r   r   r  r   r*   r?   r  r  r  r  )r   r  rE   rE   rF   codegen_device_guard_enter  s    


z/PythonWrapperCodegen.codegen_device_guard_enterc                 C  s&   |  t  tjjr| j  d S d S r>   )r   r  r   r   r   r   r
  r   rE   rE   rF   codegen_device_guard_exit  s   z.PythonWrapperCodegen.codegen_device_guard_exitoutput_refsc                 C  s2   |r| j dd| d  d S | j d d S )Nzreturn (re   , )z	return ())rb  r   rr   )r   r  rE   rE   rF   generate_return#  s   z$PythonWrapperCodegen.generate_returnresultr/   c                 C  r  r>   rE   r   r  rE   rE   rF   generate_before_suffix)  r  z+PythonWrapperCodegen.generate_before_suffixc                 C  sB   t jrd| jt| jdkrdnd }|d| d d S d S )Nre   r+   r  rY  z-
                runner = Runner(partitions=[z{])
                call = runner.call
                recursively_apply_fns = runner.recursively_apply_fns
                )r   r  rr   all_partition_namesr[   r   )r   r  Zall_partition_name_listrE   rE   rF   generate_after_suffix,  s   
z*PythonWrapperCodegen.generate_after_suffixc                 C  r  r>   rE   r  rE   rE   rF   generate_end:  r  z!PythonWrapperCodegen.generate_endc                 C  s   |  || d S r>   )generate_extern_kernel_alloc)r   Zfallback_kernelrs   rE   rE   rF   generate_fallback_kernel=  r  z-PythonWrapperCodegen.generate_fallback_kernelc              
   C  s   t |jtj}| }| }| }| j}tj	r"d|v r"d| }|r9| 
| j | dd| d|  d S | 
| j | d| dd| d|  | jrrtjrt|d urvtd d  d	7  < | 
d
|jd| d d S d S d S d S )NZview_as_complexz.clone()rn   re   ro   r   ZinductorZintermediate_hooksr+   zrun_intermediate_hooks()r   r5  r   
NoneLayoutr$  Zget_origin_nodeZget_kernel_namerh  r   memory_planningr   rf  rr   rj  Zgenerate_intermediate_hooksr   rl   )r   Zextern_kernelrs   Z	no_returnoutput_nameZorigin_nodekernel_namerh  rE   rE   rF   r  @  s.   
*$z1PythonWrapperCodegen.generate_extern_kernel_allocrc   outout_viewrs   rO  c                 C  sz   t jjj}|||d d d |d|r|n|  | | | dd| d W d    d S 1 s6w   Y  d S )NZexternzout=rn   re   ro   )r*   r?   wrapper_coder  set_printer_argsr   r   rr   )r   rc   r  r  rs   rO  debug_printer_managerrE   rE   rF   generate_extern_kernel_out]  s   
	"z/PythonWrapperCodegen.generate_extern_kernel_outFc                   s   |j }|j}|rtdd |D }tdd |D }|j  d}d fdd|D }d fdd|D }t |j}d}| d	|j	 d
}| d| d| d| }	| d|	 d}
|
S )Nc                 s      | ]
}t jj|V  qd S r>   r*   r?   rC   atomically_apply_size_hintrg   drE   rE   rF   r   p  s    zEPythonWrapperCodegen._generate_tma_descriptor_call.<locals>.<genexpr>c                 s  r  r>   r  r  rE   rE   rF   r   q  s    
z.data_ptr()re   c                 3      | ]	}t  |V  qd S r>   r   val_to_arg_strrg   dimr   rE   rF   r   w  s    c                 3  r  r>   r  r  r   rE   rF   r   x  s    
z$triton.tools.experimental_descriptorz.create_Zd_tma_descriptorrn   ro   )
dims
block_dimsr   tensorr  rr   r   r  element_sizeZrank)r   descapply_size_hintsr  r  Zptrr  r_  r   rs   r  rE   r   rF   _generate_tma_descriptor_calll  s$   z2PythonWrapperCodegen._generate_tma_descriptor_callc                 C  s.   |  |}|j d| | j }| | d S Nr   )r  rl   rh  r   )r   r  r  r   rE   rE   rF   generate_tma_descriptor  s   
z,PythonWrapperCodegen.generate_tma_descriptorc           	      C  sf   | dd tt| }|dr|d dg| 7 }n|r(|dt| 7 }|d7 }| | d S )Nrn   r  zaten.scatter_reducere   rY  z	, reduce=ro   )rr   maprM   r   rU   r   )	r   r   inputscpp_kernel_namepython_kernel_nameZsrc_is_tensorr0  r   r   rE   rE   rF   generate_scatter_fallback  s   

z.PythonWrapperCodegen.generate_scatter_fallbackc                 C  s4   dd | d}||||g}| | || d S )N[re   ])rr   r   wrap_kernel_call)r   rc   r   indicesvalues
accumulateZindices_strrs   rE   rE   rF   generate_index_put_fallback  s   z0PythonWrapperCodegen.generate_index_put_fallbackbuf_namer  r  codegen_argsop_overloadOptional[torch._ops.OpOverload]c              	   C  s&   |  | d| dd| d d S )Nr   rn   re   ro   )r   rr   )r   r  r  r  r  r  raw_argsoutputsrE   rE   rF   ,generate_fallback_kernel_with_runtime_lookup  s   &
zAPythonWrapperCodegen.generate_fallback_kernel_with_runtime_lookupc                 C  s6   t d | |W  d    S 1 sw   Y  d S )NPythonWrapperCodegen.generate)r   	_generate)r   is_inferencerE   rE   rF   generate  s   
$r	  c                 C  s   t jrdS dS )Nr   r+   )r   r  r   rE   rE   rF   get_wrapper_call_indent  s   z,PythonWrapperCodegen.get_wrapper_call_indentc                 C  s6  t jr|   t }|| j |d || j tj	j
r*tj	jr*tj	jr*t }|| j t }|| j  t jrE| | t jrL|   |rVt jrV|   n|   t jjrft jjsf|   | jD ]}t|trw| | j qi| j| qi| ! }| "  t jj#r| jtj	j$%  t jr| &  t jjrt jjs| '  t jjr| (  t j)rt js| jd | *| W d    n1 sw   Y  | +  || j, | - }|| || j W d    n1 sw   Y  | .| || j/ | 0| | 1| | 2| |3 | j43 fS )NrY  z+nvtx._device_range_end(training_annotation))5r   Zprofile_bandwidthr  r/   r   r[  r   r^  r*   r?   r  r  Zis_const_graphrd  r   	ExitStackenter_contextrb  r   Zprofiler_mark_wrapper_call#generate_profiler_mark_wrapper_callgenerate_start_graphr  memory_planmemory_plan_reuser   Zstore_cubinr   !generate_reset_kernel_saved_flagsre  r   r   r  r  r  r  r  r  generate_end_graph generate_save_uncompiled_kernelsgenerate_and_run_autotune_blockr  r  finalize_prefixr_  r  r  r`  r  r  add_benchmark_harnessZgetvaluewithlinemapra  )r   r  r  stackr   r  Zwrapper_call_indentrE   rE   rF   r
    sl   






+



zPythonWrapperCodegen._generatec              
   C  s   | j d i }| j  d | j  }tjtjkrDtj	t
 ddd}||d |j}W d   n1 s9w   Y  td| zt|| W dS  ty` } ztd	| |d}~ww )
z
        Compose self.kernel_autotune_defs and self.kernel_autotune_calls into a single block of
        code and execute it to trigger Triton kernel compilation and auto-tuning
        zQ
            async_compile.wait(globals())
            del async_compile
        r  z.pyF)dirr`  deletezutf-8NzAuto-tuning code written to %sz%Failed to run autotuning code block: )rc  r   r   r   r   levelloggingDEBUGtempfileNamedTemporaryFiler   writeencoderl   debugexec	ExceptionRuntimeError)r   scopeZtuning_codef	file_patherE   rE   rF   r    s8   z4PythonWrapperCodegen.generate_and_run_autotune_blockc                 C  s"   ddl m} || | j| _d S )Nr+   )MemoryPlanner)r  r,  r  re  )r   r,  rE   rE   rF   r  '  s   z PythonWrapperCodegen.memory_planc                 C  s  t j }| jr2t| jd tr2| jd jj|vr2| j  | jr2t| jd tr2| jd jj|vst	 g}g }t
t| jD ]/}| j| }t|trV||d | j|< q?t|trb|t	  q?t|trn||  q?||  t|dks~J tdd |D }d S )Nr  r   c                 s  s    | ]}|j V  qd S r>   )r   )rg   srE   rE   rF   r   I  s    
z9PythonWrapperCodegen.memory_plan_reuse.<locals>.<genexpr>)r*   r?   get_output_namesre  r   r  r:   rl   r   r   ranger[   r  r   r   r  sum)r   Z	out_namesZplanning_statesZpast_planning_statesir   Z_total_allocated_buffer_sizerE   rE   rF   r  ,  s8   





z&PythonWrapperCodegen.memory_plan_reuser   ir.TensorBox
bound_varsOrderedSet[sympy.Symbol]c           	   	     sp  | j  td  fdd}td  fdd}t|tjr<t|tjr)||v r+d S  | d|  || d S t|t	j
rt| D ]#\}}t|tjrk||vrk | d|| d| d || qHt| D ]#\}}t|tjr||vr | d|| d| d || qrd S t|t	jrd S t|t	jrd S tjjjrd S tdt| )	Nc                         |  d|  d |  dS )Nz_size = z.size()_sizer   rl   r   rE   rF   sizeofU     
zDPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.sizeofc                   r5  )Nz
_stride = z	.stride()Z_strider7  r8  r9  rE   rF   strideofZ  r;  zFPythonWrapperCodegen.codegen_input_symbol_assignment.<locals>.strideofr   r  r  zUnknown value type: )r_  r/  ry  r   r   r	   Symbolr   r   r   Z	TensorBox	enumeraterL  rM  r  r  r  Z	_inductorr   r  r\   r   )	r   rl   r   r3  r:  r<  r  r?  rQ  rE   r9  rF   codegen_input_symbol_assignmentM  s:    
 

z4PythonWrapperCodegen.codegen_input_symbol_assignmentc                 C  sX   t tj  }|  }dd | D dd | D  }|D ]\}}| ||| qdS )z$Assign all symbolic shapes to localsc                 S  s$   g | ]\}}t |tjr||fqS rE   r   r   r=  rg   kvrE   rE   rF   ri     s
    z7PythonWrapperCodegen.codegen_inputs.<locals>.<listcomp>c                 S  s$   g | ]\}}t |tjs||fqS rE   r@  rA  rE   rE   rF   ri     s   $ N)r   r   r=  r  rX   r?  )r   r3  r  r  rl   r   rE   rE   rF   r  w  s   z#PythonWrapperCodegen.codegen_inputssymsympy.Symbolc                 C  sd   t |tjr.t|tjr0|| jv rd S | j| tj	j
j| }| | dt|  d S d S d S r  )r   r   r=  r   r   ZPRECOMPUTED_SIZEr   r   r*   r?   rC   inv_precomputed_replacementsr   pexpr)r   rD  exprrE   rE   rF   ensure_size_computed  s   
z)PythonWrapperCodegen.ensure_size_computedc                 C  r  r>   rE   r   rE   rE   rF   r    r  z$PythonWrapperCodegen.finalize_prefixTrD   r   r	   rD   c                C  s   t d)Nz8codegen_cpp_sizevar is only implemented for cpp_wrapper!)r'  r   r   rD   rE   rE   rF   codegen_cpp_sizevar  r  z(PythonWrapperCodegen.codegen_cpp_sizevarc                C  s   t ||dS )NrJ  )rG  rK  rE   rE   rF   codegen_python_sizevar  r  z+PythonWrapperCodegen.codegen_python_sizevarc                 C  
   |  |S r>   )rM  )r   r   rE   rE   rF   codegen_sizevar  r   z$PythonWrapperCodegen.codegen_sizevarbasenamerS  c                 C  s   | d| dS )Nr  r  rE   )r   rP  rl   rS  rE   rE   rF   codegen_tuple_access  r  z)PythonWrapperCodegen.codegen_tuple_accessrP  Sequence[Expr]c                 C  sN   g t | j|}t|dkrdS t|dkrd|d  dS dd| dS )Nr   ()r+   rn   r  re   ro   )r  rM  r[   rr   )r   rP  partsrE   rE   rF   r     s   z/PythonWrapperCodegen.codegen_python_shape_tuplec                 C  rN  r>   )r   )r   rP  rE   rE   rF   rR    r   z(PythonWrapperCodegen.codegen_shape_tuplec                 C  s.   d d|t|t|| || |gS )Nzalloc_from_pool({})re   )formatrr   rG  rM   r   )r   rl   offsetr
   rP  rQ  rE   rE   rF   codegen_alloc_from_pool  s   z,PythonWrapperCodegen.codegen_alloc_from_poolr   Callable[..., None]c                 C  s   ||j jkr+||j jkr+||j jkr+|d ur&||jkr&d|  d| dS |  S | |}| |}| |}|d urW||jkrWd|  d| d| d| d| dS d|  d| d| d| d	S )Nzaten.view.dtype(re   ro   z#aten.view.dtype(reinterpret_tensor(z), zreinterpret_tensor()r5  r?  rQ  rV  r
   r$  r   rO  )r   datar?  rQ  rV  r   r
   rE   rE   rF   codegen_reinterpret_view  s   




( z-PythonWrapperCodegen.codegen_reinterpret_viewnon_blockingc                 C  s    |  | d| d| d d S )Nz.copy_(re   ro   r7  )r   r   dstr[  rE   rE   rF   codegen_device_copy  s    z(PythonWrapperCodegen.codegen_device_copyc                 C  s$   |  | j | d| | j  d S r  )r   rf  rh  )r   rl   r   rE   rE   rF   codegen_multi_output  s   $z)PythonWrapperCodegen.codegen_multi_outputc                 C  s0  dd |j D \}t|jdkr| |j d| d not|jdkr9t|jd tr9| |j d| d nSt|jdkrt|jd tr| |j d	| d | d
|j d|jd j d|j d|jd j d	 | |j d|j d|jd j  nt	d|j | |
  d d S )Nc                 s  s    | ]}|  V  qd S r>   r  )rg   trE   rE   rF   r     r   z>PythonWrapperCodegen.codegen_dynamic_scalar.<locals>.<genexpr>r   r   .item()r+   z = 1 if z.item() else 0z_undivided = zassert z_undivided % z
 == 0, f'{z_undivided} not divisible by 'z_undivided // unrecognized keypath z = None)r  r[   keypathr   rD  r   r   r   divisorr\   r$  )r   r:   rY  rE   rE   rF   codegen_dynamic_scalar  s&   
z+PythonWrapperCodegen.codegen_dynamic_scalarc              	     s:   fdd} fdd} fdd}  g d     jdd	d
 tjj D ]\}} d|  ||| |	 |j
|j q,ttjjdkrl d tjj D ]\}} d|  ||| qZtjj D ]}\}}t|tjrttjjj|d trqrt|tjrttjjdkr d  d|  |||  qrt|tjr||tjjj|dd qrt|tjr||d|j
j d qrdd | D }dd | D }	||||	| |   qrdd!tjj"  d}
 d|
   d W d    d S 1 sw   Y  d S )Nc                   s8     |  d| d| d| d| d
 d S )Nz = rand_strided(re   
, device='	', dtype=ro   )r   r   )rl   rP  rQ  rO  r
   r   r   rE   rF   add_fake_input  s   zFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_fake_inputc                   s     |  d|  d S r  r7  )rl   r   r   rE   rF   add_expr_input     zFPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_expr_inputc                   s8   dd l }t|tjsJ  |  d||d d S )Nr   z = pickle.loads(ro   )pickler   r  ZScriptObjectr   dumps)rl   r   rn  rk  rE   rF   add_torchbind_input  s    zKPythonWrapperCodegen.benchmark_compiled_module.<locals>.add_torchbind_input)rY  rY  z3def benchmark_compiled_module(times=10, repeat=10):z
                from torch._dynamo.testing import rand_strided
                from torch._inductor.utils import print_performance
                Tr   zglobal r   zimport pickle*   fallbackztorch.cuda.default_generators[z].graphsafe_get_state()c                 S     g | ]}t jjj|d dqS rq  rr  r*   r?   rC   	size_hintr  rE   rE   rF   ri   8      zBPythonWrapperCodegen.benchmark_compiled_module.<locals>.<listcomp>c                 S  rt  ru  rv  r  rE   rE   rF   ri   <  rx  zcall([re   z])zfn = lambda: z8return print_performance(fn, times=times, repeat=repeat))#
writelinesr   r   r*   r?   	constantsrX   r   r?  rQ  rO  r
   r[   torchbind_constantsr  r   r   r=  rC   Z
var_to_valra   r   r   r  Zget_real_objr	   rw  r  rS  rL  rM  rK  rB   rr   keys)r   r   rj  rl  rp  rl   r   Ztorchbind_objrP  rQ  Zcall_strrE   ri  rF   benchmark_compiled_module  sn   


$z.PythonWrapperCodegen.benchmark_compiled_modulec                 C  sh   t jsdS | | |g d |  |ddt  dg W d   dS 1 s-w   Y  dS )zL
        Append a benchmark harness to generated code for debugging
        N)rY  rY  zif __name__ == "__main__":zBfrom torch._inductor.wrapper_benchmark import compiled_module_mainzcompiled_module_main('z', benchmark_compiled_module))r   Zbenchmark_harnessr}  ry  r   r$   r   r   rE   rE   rF   r  L  s   

"z*PythonWrapperCodegen.add_benchmark_harnessr  kernel_bodymetadatagpucpp_definitionc                 C  sf   t jjrd| d| }| j| tjjrd S |r| dnd}d| | d| }| j| d S )Nz

r   r  rY  )	r   r   r   rc  r   r*   r?   r  r^  )r   r  r  r  r  r  bodyZmetadata_commentrE   rE   rF   define_kernel^  s   z"PythonWrapperCodegen.define_kernelfn_codec                 C  r  r>   )rd  r   )r   r  rE   rE   rF   define_subgraph_launcher_fnq  r  z0PythonWrapperCodegen.define_subgraph_launcher_fnrw   "list[list[Union[int, sympy.Expr]]]c           (   	     s  ddl m} ddlm}m}	m}
 ddlm m}m	}m
}m} ddlm}m} |  |j}g i g g }fdd	d5 fdd	}t|jD ]\}}||jv r^|| |ddd qJ|vrcqJ| }| d u rx|| |ddd qJt|tjr||||d qJt|tjr||||| | d qJt|tjr|||||j | |jjd qJt|ttjfot j!j"#|d}||||||d qJt$d dd |jD d}|t%&t j!' i t()|dt*dgd}|rt+||d< |r
t+||d< t,|dkr |	- }g t.tj/|d }n]d6fdd i fd!d|D }|r>t,|t,|ks@J g }t0t1||d"d# dd$D ]\}}|2||g t.t3|g t.t4|d% qN|
j|g t.t56 d&}g 7 }t8|j9g}t,|dkr6 D ]}t|tjtjfs|2| q|2t5| |:t5| t+|}|| j;v rg | j;| |R S | d't,| j; } t< }!t=jj>r|!?d(| d) n	|!?d(|d) | |d*< |@|A  |!B|  |!Bd+g t.||d,|d-|d. tC|}"t=jj>r$|"Dd/| d0d/|  d0}"|!B|" t j!' }#|!?d1|#jE d2 tFG|j9\}$}%tFH|j9}&d3|& d4|% }'| I| |!J |' | |f| j;|< | ||fS )7Nr   )patch_triton_dtype_reprr   )config_to_dict	FixedGridPrecomputedGridr+   )ConstexprArgKernelArgTypeSizeArg	TensorArgTMADescriptorArg)gen_common_triton_importsTritonKernelc                   s    |   |  d S r>   )r   )idxrK   )arg_indices	signaturerE   rF   add_to_signature  s   
zPPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_to_signatureFc                   s   |rt  r
| | |jv r|j |j< d S d S |jv s"J |r=t  r1|  |jd n| | d|j< d S |rRt  rK|  |jd d |j< d S | | d S )Nr8  r+   )r)   rl   )r  rK   is_constexprequals_1equals_none)r  r  rz  r   rE   rF   add_arg  s"   


zGPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.add_argr8  T)r  )r  )rl   bufferr
   )rl   r  r
   rV  )r  c                 S  rf   rE   )r,   r  rE   rE   rF   ri     rj   zJPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<listcomp>)Z
size_dtyper  Zargdefs)r  )r  rO  rz  ru   Zrestore_valueZreset_to_zerorH  r}   r<   r~   c                   sx   t | tjr0g | j}|s| S |jtd |D ]}| v rqtdt   |< qt|  S t | t	s7J t
| S )N)r   Z_launcher_s)r   r   r	   free_symbolssortrM   r=  r[   r(   r   r   )rH  symbolsrD  )extra_launcher_argsrE   rF   rename_sizes_for_launcher  s   



zYPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.rename_sizes_for_launcherc                   s   g | ]	}g t  |qS rE   )r  )rg   r   )r  rE   rF   ri   (  s    c                 S  r   r   r   r   rE   rE   rF   r   -  r   zHPythonWrapperCodegen.define_user_defined_triton_kernel.<locals>.<lambda>r   )r   pythonrT   )Z	grid_typeprecomputed_gridsr  _zasync_compile.triton(z, '''r  zG
            @triton_heuristics.user_autotune(
                configs=z ,
                inductor_meta=z,
                triton_meta=z{,
                filename=__file__,
                custom_kernel=True,
            )
            @triton.jit
            r   rn   z''', device_str='r  z# Original path: r  )FFF)rH  r}   r<   r~   )Ktorch.utils._tritonr  Zruntime.triton_heuristicsr  r  r  commonr  r  r  r  r  r   r  r  r   r>  	arg_namesZ
constexprsr   r   TMADescriptorr#  r$  rB   r    rY  r5  rV  r   r   r   r*   r?   rC   Zstatically_known_equalsr6   r"   r  get_current_device_or_throwdictfromkeysr4   r   r[   Zsetup_grid_as_argsr  Zsympifyr   r   r   rG  r3   rM   r  r|  idr   extendrk  r/   r   Zunique_user_kernel_namesr   updateZinductor_meta_commonr   r   replacer   inspectgetsourcelinesgetsourcefiler  r   )(r   rc   ru   r   Zrestore_value_argsZreset_to_zero_argsrw   r  r  r  r  r  r  r  r  r  r  original_nameZequal_to_1_argsr  r  r   rK   r  Ztriton_signaturetriton_metaZinductor_metaZextra_launcher_call_argsr  r   cfg	cache_keyrl   r   Z
kernel_srcZcurrent_devicer  linenosrcfiler  rE   )r  r  r  rz  r  r   r  r  rF   !define_user_defined_triton_kernelt  s*  	$









z6PythonWrapperCodegen.define_user_defined_triton_kernelr`  c                 C  sN   | d|j  d}|d ur|d| 7 }| | dt|j  t||jS )Nr  r>  r   )r_  r   rG  r>  r   )r   r  treer`  rH  rE   rE   rF   generate_numel_expry  s
   z(PythonWrapperCodegen.generate_numel_exprwsr1   c              
   C  s.  |  }t| |}|jtjkr| | nP|jtjkr)| | | | | n<|jtjkr`| j	
|}|rMt|trBt|jtsDJ t|j||_n| | | | | || j	|< nt|jtjjr| jtj| ||j|jtjj|jfdd |jtjkr| jt| | d S d S d S )N)r+   )rP  rQ  )r$  r(  	zero_moder2   ZUNINITIALIZEDr   ZZERO_ON_CALLmake_zero_bufferZERO_PER_GRAPHr~  ra   r   r:   r1   maximumr\   r   r   r   r   r   make_allocationrO  r
   r*   r?   rC   rw  r   )r   r  rl   r   ZpriorrE   rE   rF   generate_workspace_allocation  sF   





z2PythonWrapperCodegen.generate_workspace_allocationc                 C  s$   |j tjkr| t| | d S d S r>   )r  r2   r  r   r   )r   r  rE   rE   rF   generate_workspace_deallocation  s   z4PythonWrapperCodegen.generate_workspace_deallocationc                 C  s   | d| j  S )Nz.zero_())rh  )r   rl   rE   rE   rF   r    r  z%PythonWrapperCodegen.make_zero_bufferc                 C  s   | dd | d| j S )Nrn   re   ro   )rr   rh  )r   rl   	call_argsrE   rE   rF   r    s   z%PythonWrapperCodegen.wrap_kernel_callc                 C  s8   | j d | j dtjj d || j   d S )Nz*from torch.profiler import record_functionzwith record_function('graph_z_inductor_wrapper_call'):)rb  r   r*   r?   Zgraph_idr  r   )r   r  rE   rE   rF   r    s
   z8PythonWrapperCodegen.generate_profiler_mark_wrapper_callc                 C  r  )Nzstart_graph())rb  r   r   rE   rE   rF   r    r  z)PythonWrapperCodegen.generate_start_graphc                 C  s   | j dtjd d S )Nz
end_graph(ro   )rb  r   r   Zprofile_bandwidth_outputr   rE   rE   rF   r    r   z'PythonWrapperCodegen.generate_end_graphc                 C  r  )NU
            for kernel in globals().values():
                if isinstance(kernel, zU.CachingAutotuner):
                    kernel.cuda_kernel_saved = False
            rb  r   r!   r   r   rE   rE   rF   r    s
   z6PythonWrapperCodegen.generate_reset_kernel_saved_flagsc                 C  s   | j dtj d dS )a[  
        Precompile and save the CUBINs of the Triton kernels that haven't
        been precompiled and saved as a side effect of running the generated
        JIT model (Python wrapper). This can happen when the model contains
        control flow: only one pass through the control flow operators covers
        the kernels that are saved, the remaining kernels are not launched,
        hence not saved. The main purpose of this codegen is to compile and
        save the Triton kernels outside the active control flow path for
        subsequent AOTInductor code generation and compilation.
        r  a  .CachingAutotuner):
                    if not kernel.cuda_kernel_saved:
                        if len(kernel.launchers) == 0:
                            kernel.precompile()
                        kernel.save_gpu_kernel(
                            grid=(0, 0, 0),   # use dummy grid
                            stream="stream",  # use dummy stream
                            launcher=kernel.launchers[0],
                        )
            Nr  r   rE   rE   rF   r    s
   z5PythonWrapperCodegen.generate_save_uncompiled_kernelsc                   s   dd   fdd|D S )Nc                 S  sJ   t | trt| r| d S | S t | ttttfrt| S ttj	j
| S )Nra  )r   rM   r5   r   floatr   r   rG  r*   r?   rC   rD   )rK   rE   rE   rF   wrap_arg  s
   
zAPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.wrap_argc                   s   g | ]} |qS rE   rE   rm   r  rE   rF   ri     rj   zCPythonWrapperCodegen.prepare_triton_kernel_call.<locals>.<listcomp>rE   )r   r  rE   r  rF   prepare_triton_kernel_call  s   	z/PythonWrapperCodegen.prepare_triton_kernel_callc                   s0  t |trt |tjr|j }tj|}n tj	|d ur(|}tj|}n|d us0J dd| }|}t
dd | D }t
dd tj|D }t
dd | D }	| }
| }tjjj| jtjd}d| d	|	 d
|
 d| d	| d	| d} j| d|  t |tjr j|dd}|} j| d|  |S t|tjst |trt |tr| jv r|S |d u rdS |}t |tr|j}|tjjj v rtjjj | }ttjjj!|tjdS t |tt"t#t$frt|S t |t%rdd	& fdd|D  dS t'dt(| )NzBV.graph.get_buffer(arg) and raw_arg can't be None at the same timeZtmp_arg_c                 s  $    | ]}t jjj|tjd V  qdS rr  Nr*   r?   rC   r  r   unbacked_symint_fallbackrg   r+  rE   rE   rF   r         
zBPythonWrapperCodegen.generate_example_arg_value.<locals>.<genexpr>c                 s  r  r  r  r  rE   rE   rF   r     r  c                 s  r  r  r  r  rE   rE   rF   r     r  rr  zgenerate_example_value(re   z, 'z', ro   r   T)r  r  r   r  c                 3  r   r>   r   rg   ar   rE   rF   r   C      r  zUnsupported type ))r   torch_dtyper   r  r  r$  r*   r?   
get_bufferZtry_get_bufferr   rL  get_allocation_sizerM  rK  rB   rC   rw  Z
get_layoutrV  r   r  r   r   r  
issubclassr   ZBasicr   rM   r|  r   rF  r  r   r  r   r   rr   rV  r   )r   rK   arg_typeraw_argrS  r  r  r?  Zallocation_sizerQ  rO  r
   rV  r   rE   r   rF   r     st   




*


 z/PythonWrapperCodegen.generate_example_arg_valuec                   s2   t |trdd fdd|D  d S t|S )Nr  re   c                 3  s    | ]}  |V  qd S r>   )_grid_dim_str)rg   r|   r   rE   rF   r   J  s    z5PythonWrapperCodegen._grid_dim_str.<locals>.<genexpr>r  )r   r   rr   rG  )r   Zgrid_per_dimrE   r   rF   r  G  s   
z"PythonWrapperCodegen._grid_dim_str)rO  r   	arg_typesr  r  c             
   C  s  |pt j }|s|jdks| | || dS | |}d|}t	| |j
t j}	|sGd|	 d}
| | d| d| d|
 d dS |   tjjr|| jvr|durbt|t|ksfJ di }g }|du rvdgt| }nt|t|ksJ d	tt|||D ][\}\}}}d}t|trd
t|v r|d
\}}t|trtd|r|}|||< n||vr| ||||}|||< n|| }n| ||||}||du r|n| d
|  q| j| dd| d|	 d | jdddd | D  d | j| t jjrdS t jjj }|!|||d | | | d| d|	 d W d   dS 1 sCw   Y  dS )z
        Generates kernel call code.

        triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True,
                and C++ when gpu=False.
        r)  Nre   z	c_void_p(ro   r   rn   z$call_args and arg_types do not matchz#call_args and raw_args do not matchr  z^(workspace|semaphore)z.run(z	, stream=del c                 s      | ]}|V  qd S r>   rE   rm   rE   rE   rF   r         z<PythonWrapperCodegen.generate_kernel_call.<locals>.<genexpr>r  )"r*   r?   r  r   r   r  r  rr   r   rz  rS  r  r   r   r   r   r[   r>  r   r   rM   splitr  rY   matchr   r   r   r  r   r  r  r  r  )r   r  r  rO  r   r  r  r  Zcall_args_strstream_nameZ
stream_ptrZtensor_argsall_argsr1  rK   r  r  r   Zarg_strr  rE   rE   rF   generate_kernel_callO  sz   







"

$z)PythonWrapperCodegen.generate_kernel_callc                 C  r  r>   )re  r   )r   r   rE   rE   rF   r     r  zPythonWrapperCodegen.writelinec                 C  s   |D ]}|  | qd S r>   r7  )r   re  r   rE   rE   rF   ry    r  zPythonWrapperCodegen.writelinesc                 C  s   | j t| d S r>   )re  r   r%   )r   ctxrE   rE   rF   r    r   z"PythonWrapperCodegen.enter_contextc                   s   ddl m}m} | rdd l}t|trt|jjS t|t	j
r$t|S t|ttfrEtjG dd d tt| fdd|D S t|tjjrPt|S t|tjtjtfr^| S | rlt||jjrl||S t|tjrv| S t|S )Nr   )dtype_to_stringhas_triton_packagec                   @  s   e Zd ZU ded< dd ZdS )z1PythonWrapperCodegen.val_to_arg_str.<locals>.Shimr   refc                 S  s   | j S r>   )r  r   rE   rE   rF   __repr__  s   z:PythonWrapperCodegen.val_to_arg_str.<locals>.Shim.__repr__N)r   r   r   r   r  rE   rE   rE   rF   Shim  s   
 r  c                 3  s     | ]} t |V  qd S r>   r  r  r  r   rE   rF   r     r  z6PythonWrapperCodegen.val_to_arg_str.<locals>.<genexpr>)r  r  r  r   r   r   rG  r:   rH  r   r	   r   r   r   	dataclassrU   r   r  Z_opsZ
OpOverloadr   r   r#  Z
MutableBoxr    r  languager
   r  )r   r-  type_r  r  r   rE   r  rF   r    s,   
z#PythonWrapperCodegen.val_to_arg_strr  r;   c                 C  sP   |  }| }t| }ttj|}t| }| |	 |||||S r>   )
rK  rB   r   rL  r*   r?   r  rM  r  r$  )r   r  rO  r
   rP  allocation_shaperQ  rE   rE   rF   r3    s   z+PythonWrapperCodegen.make_buffer_allocationc              
   C  s   |d u r|}|  |}|  |}|  |}	|jdv r-| d|j d| d|	 d| d
}
n| d| d|	 d|j d| d
}
||krN|
d	| d|	 d }
|
S )
N)r)  cudaZxpuz = empty_strided_rn   re   ro   z = empty_strided(rg  rh  z.as_strided()r   r   )r   rl   rO  r
   rP  rQ  r  rR  Zcodegen_allocation_shape_tupleZcodegen_stride_tupler  rE   rE   rF   r    s:   


	z$PythonWrapperCodegen.make_allocationrY  c              	   C  s(   | j  | d| | j d| j d| 	S )Nr     rk   )rf  rh  ri  )r   new_nameold_nameri  rE   rE   rF   make_tensor_alias	     (z&PythonWrapperCodegen.make_tensor_alias%Union[BufferLike, ir.TorchBindObject]c                 C  s   d|   S )Nr  r$  )r   r  rE   rE   rF   r6  
	  s   z%PythonWrapperCodegen.make_buffer_freenames_to_delc                 C  s   dd dd |D  S )Nr  re   c                 s  r  r>   rE   )rg   rl   rE   rE   rF   r   	  r  z:PythonWrapperCodegen.make_free_by_names.<locals>.<genexpr>)rr   )r   r  rE   rE   rF   make_free_by_names	  r   z'PythonWrapperCodegen.make_free_by_namesr  r  del_linec              	   C  s(   | j  | d| | | j d| j d	S )Nr   r   reuse)rg  rh  ri  )r   r  r  r  rE   rE   rF   codegen_exact_buffer_reuse	  r  z/PythonWrapperCodegen.codegen_exact_buffer_reuseoldnewr8  c                 C  s   |  |  ks
J | }| }d}|tj vr%|r%d| | }| | kr<| | kr<| |||S | 	|| | d| j
j}| j | d| | d| j dS )N;z; r   r   r  r  )rB   r$  r*   r?   r.  r6  rL  rM  r  rZ  rb  r   rf  ri  )r   r  r  r8  r  r  r  Zreinterpret_viewrE   rE   rF   r9  	  s    "z&PythonWrapperCodegen.make_buffer_reuseviewir.ReinterpretViewc                 C  s8   |  t|| j | d|  | j d| j d d S )Nr   r  z alias)r   r.   rf  r  rh  ri  )r   rl   r  rE   rE   rF   codegen_deferred_allocation#	  s   &z0PythonWrapperCodegen.codegen_deferred_allocationr;  c                 C  sR  |  }|tjjv s|| jv st|tjrd S | j| t|	 tj
tjfr.| s.d S | }t|tjr:d S t|tjrBd S t|tjrt|jtjs]J dt|j d|j t|jjtjsmJ t|jjt|jjjtjs~J t|jj| |jjj | ||j d S t|tjr| t| | d S | t| | d S )Nzunexpected r   )r$  r*   r?   r*  rv  r   r   DonatedBufferr   Zget_defining_opZExternKernelAllocMultiOutputZshould_allocaterB  ZMutationLayoutSHOULDREMOVEr  ZNonOwningLayoutr  r    r   rY  Z
StorageBoxr#  codegen_allocationr   rC  r   rJ  r(  )r   r  rl   r5  rE   rE   rF   r  +	  sB   


 "z'PythonWrapperCodegen.codegen_allocationc                 C  s   |  }t|tjtjfr| | | d S t| tjr)| t	| | d S | 
|s0d S | j| | t| | d S r>   )r$  r   r   ZInputBufferr  r   r6  rB  rC  rX  	can_reuserw  r   r   )r   r  rl   rE   rE   rF   codegen_freeS	  s   
z!PythonWrapperCodegen.codegen_freec                 C  sf   |  }|tjjv p1|tjjv ottjj| tj p1|tjj	v p1|tjj
v p1|tjjv p1|| jv  S r>   )r$  r*   r?   r*  r  r   Zgraph_inputs_originalr   r  rz  r{  Znever_reuse_buffersrw  )r   input_bufferoutput_bufferrl   rE   rE   rF   r  g	  s   


	
zPythonWrapperCodegen.can_reusec                 C  s$   |  | jv o| j|   |  kS r>   )r$  rx  )r   r  Zreused_bufferrE   rE   rF   	did_reusew	  s   zPythonWrapperCodegen.did_reuser  r  c                 C  s`   t ||sJ | | | j|  | j|  | | j| < | t| || d S r>   )	rJ   r  rw  r   r$  rv  rx  r   r-  )r   r  r  rE   rE   rF   codegen_inplace_reuse	  s   
z*PythonWrapperCodegen.codegen_inplace_reusec                 C  s,   t |}|| jv r|S | j| | j| S r>   )rM   rl  r   rf  )r   r   rl   rE   rE   rF   codegen_unbacked_symbol_decl	  s
   

z1PythonWrapperCodegen.codegen_unbacked_symbol_declr  r  r   unbacked_bindings,Optional[dict[sympy.Symbol, pytree.KeyPath]]c                   sp   t tjjj|}|sd S | D ]%\}d
 fdd  fdd}| | | d	|  | j  qd S )NrH  rM   rd  pytree.KeyPathc                   s:  |dkr| S t |dkr3t|d tr3t|d tjr3 |  d|d j d|d j d|dd  S t|d trL |  d|d j d|dd  S t|d tjr}tjj	rk d	|d j d
|  d|dd  S  |  d|d j d|dd  S t|d t
r |  d|d j d|dd  S td| )NrE   r   r   r+   r   rn   ro   rS  z	std::get<z>(r  r  z.__floordiv__(rc  )r[   r   r   pytreeSequenceKeyrl   r  r*   r?   r  r   re  r\   )rH  rd  )gorE   rF   r  	  s*   *$&"$zIPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.goc                    s   t jjrEtdkr+d }  d  t| tjr(t| jdkr(dd  S S td t	j
s5J  d j  dd  S  S )Nr+   r   )r*   r?   r  r[   r$  r   r   r  r  r  r  r  )r  r  rd  r  r  rE   rF   go_outer	  s   


 
zOPythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputs.<locals>.go_outerr   )rH  rM   rd  r  )	r   r*   r?   rC   Z	shape_envrX   r   r
  rh  )r   r  r  r  r-  r  rE   r  rF   (codegen_unbacked_symbol_defs_for_outputs	  s   
z=PythonWrapperCodegen.codegen_unbacked_symbol_defs_for_outputsc                   s    fdd}fdd}z? j j dj  |  tj}tj jj|d W d    n1 sAw   Y  |  W   d S   w )Nc                    sT   t jjt  ksJ tjj D ]\} }j |  d| j  qd S r  )r[   r?   r  r   r   rf  rh  )inner_inputouter_input)outer_inputsr   subgraphrE   rF   _codegen_subgraph_prefix	  s   zSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_prefixc                    sR   t jjt  ksJ tjj D ]\} }| d|   j  qd S r  )r[   r?   r  r   r   r  rh  )Zinner_outputZouter_output)outer_outputsr   r  rE   rF   _codegen_subgraph_suffix	  s   zSPythonWrapperCodegen.codegen_subgraph_by_inlining.<locals>._codegen_subgraph_suffix subgraph: )parent_graph)	r   r?   r   ri  rl   r*   set_graph_handlercodegen_subgraphr	  )r   r  r  r  r  r  r  rE   )r  r  r   r  rF   codegen_subgraph_by_inlining	  s   		z1PythonWrapperCodegen.codegen_subgraph_by_inliningc                 C  sh   t |t |jjksJ d|jj d| t|jj|D ]\}}| | j | d| | j  qd S )Nzgraph_input_names:z, outer_inputs: r   )r[   r?   r  r   r   rf  rh  )r   r  r  r  r  r  rE   rE   rF   codegen_subgraph_prefix
  s   "z,PythonWrapperCodegen.codegen_subgraph_prefixpartition_idir.GraphPartitionSignaturec           	   	   C  s   |j }|j}d| t|dkrdnd }dd |D }d|t|dkr*dnd }| d| d| d	 d
d | D }|rO| dd|  | d| d| d| d | d| d dS )z'Generate code to call a graph partitionre   r+   r  rY  c                 S     g | ]}|  qS rE   r  )rg   r:   rE   rE   rF   ri   %
  rj   z?PythonWrapperCodegen.codegen_partition_call.<locals>.<listcomp>	partition	_args = [r  c                 S  s   g | ]\}}|r|qS rE   rE   )rg   rl   Z
deallocaterE   rE   rF   ri   +
  s
    r  rn   z) = self.partitions[z](partition_args)zdel partition_argsN)input_deallocationoutput_nodesrr   r|  r[   r   rX   )	r   r!  r  r(  r)  r  Zoutput_namesr  r  rE   rE   rF   codegen_partition_call
  s"   z+PythonWrapperCodegen.codegen_partition_callnum_partitionsc                 C  s   dd t |D | _d S )Nc                 S  s   g | ]}d | qS )Z
partition_rE   )rg   r  rE   rE   rF   ri   8
  s    z@PythonWrapperCodegen.set_all_partition_names.<locals>.<listcomp>)r/  r  )r   r+  rE   rE   rF   set_all_partition_names7
  rm  z,PythonWrapperCodegen.set_all_partition_namesc              	   C  s   |j j}d|}t|dkr|d7 }d|t|dkrdnd }| |j j d| d |d t| D ]
}| d|  q7| d| d	|j j d|j j d
 d S )Nre   r+   r  rY  r%  r  r  rn   z) = r&  )r?   r  rr   r[   r   rl   )r   r  r  r  r  Zinner_inputsZouter_output_namesr  rE   rE   rF   codegen_subgraph_call:
  s   
z*PythonWrapperCodegen.codegen_subgraph_callc              	   C  s  t jjr| ||| d S | |j | d | | j d|j  | ||| t j}|j	|j_	|jj| j
vr{t |j% tdd |j \}}W d    n1 sYw   Y  W d    n1 shw   Y  | j
|jj | |j | ||| d S )NrY  r  r  F)r*   r?   r  r  r   r   ri  rl   r   r  r}  r  r   patchr  r   r  r   r-  )r   r  r  r  r  Zsubgraph_coder  rE   rE   rF   r  O
  s&   

z%PythonWrapperCodegen.codegen_subgraphc                   sb   |   |   dt|j  dd |jD } fddtt|jD }| |j|| d S )N = [None] * c                 S  r#  rE   r_  rg   r  rE   rE   rF   ri   o
  rj   z@PythonWrapperCodegen.codegen_invoke_subgraph.<locals>.<listcomp>c                      g | ]
}  d | dqS r  r  rE   rg   r1  r8  rE   rF   ri   p
      )r$  r   r[   r  r  r/  r  r  )r   Zinvoke_subgraphr  r  rE   r8  rF   codegen_invoke_subgraphk
  s
   z,PythonWrapperCodegen.codegen_invoke_subgraphc                   s   |   dd |jD } fddtt|jD }|j }t|jtj	s+| d}| 
  dt|j  | 
d| d | 
t| |jj | |j|| | 
t|  | 
d | 
t| |jj | |j|| | 
t|  d S )	Nc                 S  r#  rE   r_  r0  rE   rE   rF   ri   v
  rj   z<PythonWrapperCodegen.codegen_conditional.<locals>.<listcomp>c                   r1  r2  rE   r3  r8  rE   rF   ri   w
  r4  ra  r/  r   r  zelse:)r$  Zoperandsr/  r[   r  	predicater  r   r   ZShapeAsConstantBufferr   r   Ztrue_subgraphr?   r  r  Zfalse_subgraph)r   Zconditionalr  r  r6  rE   r8  rF   codegen_conditionals
  s   


z(PythonWrapperCodegen.codegen_conditionalc           
        s:  |   dd |jD }dd |jD }|   dt|  t|D ]\}}|   d| d|  q$g  fddtt|D |}  dg}t|}|d t| }	| d	 | t| |j	j
 | |j	|| | d
|d  d | t|  | t| |jj
 | |j||	 | t|  d S )Nc                 S  r#  rE   r_  r0  rE   rE   rF   ri   
      z;PythonWrapperCodegen.codegen_while_loop.<locals>.<listcomp>c                 S  r#  rE   r_  r0  rE   rE   rF   ri   
  r8  r/  r  z] = c                   r1  r2  rE   r3  r8  rE   rF   ri   
  r4  Z_cond_resultzwhile True:zif not r   z: break)r$  Zcarried_inputsZadditional_inputsr   r[   r>  r/  r   r   Zcond_subgraphr?   r  r  Zbody_subgraph)
r   Z
while_loopZouter_carried_inputsZouter_additional_inputsr1  inpZcond_outer_inputsZcond_outer_outputsZbody_outer_inputsZbody_outer_outputsrE   r8  rF   codegen_while_loop
  sD   
z'PythonWrapperCodegen.codegen_while_loopc                 C  s^   z$t | dd r
W d S t| tr| W S tjj| }|d u r |W S t|W S  ty.   Y d S w )Nr  )r"  r   r   r*   r?   Z
_shape_envZ_maybe_evaluate_staticr&  )r   r   rE   rE   rF   statically_known_int_or_none
  s   

z1PythonWrapperCodegen.statically_known_int_or_nonec                 C  s4   g }| D ]}t |}|d u r d S || q|S r>   )r   r;  r   )lstr  r   numrE   rE   rF   %statically_known_list_of_ints_or_none
  s   
z:PythonWrapperCodegen.statically_known_list_of_ints_or_nonec                 C     t | d uS r>   )r   r>  )r<  rE   rE   rF    is_statically_known_list_of_ints
  s   z5PythonWrapperCodegen.is_statically_known_list_of_intsc                 C  s   t |  S r>   )r   r>  rL  r  rE   rE   rF   r.  
  s   z4PythonWrapperCodegen.static_shape_for_buffer_or_nonec                 C  r?  r>   )r   r.  rA  rE   rE   rF   !can_prove_buffer_has_static_shape
  s   z6PythonWrapperCodegen.can_prove_buffer_has_static_shaper>   )r  r   r  r   r  rz   r  r  r  )rl   rM   r  rM   r<   r   )r^  rM   )r  r  r<   rM   r<   r  r<   r  r<   r  )r  r  rH  )r  r   r<   rM   r'  )r  r   r<   r   )r  r  r<   r   r  r/   r<   r   )rc   rM   r  rM   r  r   rs   r  rO  rM   r<   r   )F)NNN)
r  rM   r  rM   r  rM   r  r  r  r  )rl   rM   r   r2  r3  r4  )rD  rE  )r   r	   rD   r   r<   rM   )r   r	   r<   rM   )rP  rM   rl   rM   rS  rM   r<   rM   )rP  rR  r<   rM   )r   rX  r<   rM   )r[  r   )NTN)
r  rM   r  rM   r  r   r  r   r  r   )r  rM   )rw   r  )r  rM   r`  r   )r  r1   )NN)r  rM   )r  r;   )rY  )r  r  )r  r  )r  rM   r  rM   r  rM   )r  r;   r  r;   r8  r   )rl   rM   r  r  r<   r   r  r;  )r  r;  r  r;  )r  rM   r  r   r  r  r<   r   )r!  r   r  r"  )r+  r   ){r   r   r   __doc__r   rW  r  rn  rt  rq  r  rs  r#   r  r  r  r  r  r  r  r  r  r  r  r  r  rr  r  rz  r  r   r	  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r?  r  rI  r  rL  rM  rO  rQ  r   rR  rW  rZ  r]  r^  rf  r}  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r   ry  r  r  r3  r  r  r6  r  r  r9  r   r  r  r  r  r	  r
  r  r  r   r*  r,  r-  r  r5  r7  r:  r;  r>  r@  r.  rB  r   rE   rE   r   rF   r     s   R=
		O#!*]  '	Sa# (	P-*	r   c                      s   e Zd ZdZ	d3d4 fd	d
Zd5ddZd5ddZdd Zdd Zdd Z	d6ddZ
d7ddZd8ddZd8d d!Zd9d#d$Zd:d&d'Zd;d)d*Zd< fd-d.Zed5d/d0Zed5d1d2Z  ZS )=r  a  
    A wrapper codegen that generates code for a subgraph. For most of the
    methods, we rely on the implementation in the PythonWrapperCodegen. But we
    override a few functions to produce cleaner code (like avoiding writing
    imports twice in the output code)
    Nr  rM   r  r   r  r  c                   s    || _ || _|| _t   d S r>   )r  r  r  r   r   )r   r  r  r  r   rE   rF   r   
  s   z%SubgraphPythonWrapperCodegen.__init__r<   r   c                 C  s   | j | _d S r>   )r  rm  r   rE   rE   rF   rn  
  s   z1SubgraphPythonWrapperCodegen.set_launcher_fn_namec                 C  r  r>   rE   r   rE   rE   rF   rq  
  r  z)SubgraphPythonWrapperCodegen.write_headerc                 C  r  r>   rE   r~  rE   rE   rF   r  
  r  z2SubgraphPythonWrapperCodegen.add_benchmark_harnessc                 C  r  r>   rE   r~  rE   rE   rF   r}  
  r  z6SubgraphPythonWrapperCodegen.benchmark_compiled_modulec                 C  r  r>   rE   r   rE   rE   rF   r    r  z5SubgraphPythonWrapperCodegen.write_async_compile_waitc                 C  r  r>   )r  r  r   rE   rE   rF   r    s   
z/SubgraphPythonWrapperCodegen.next_kernel_suffixr  r/   c                 C  r  r>   rE   r  rE   rE   rF   r    r  z2SubgraphPythonWrapperCodegen.generate_after_suffixr   c                 C  s   | j d| j d d}|S )Nz
            def z(args):
            r+   )r_  r   rm  r  rE   rE   rF   r    s   z>SubgraphPythonWrapperCodegen.write_launcher_fn_call_get_indentc                 C  r  r   rE   r   rE   rE   rF   r    r  z4SubgraphPythonWrapperCodegen.get_wrapper_call_indentr  c                 C      | j  }r
|j}|S tjj}|S r>   )r  input_nodesr*   r?   r  )r   r  r  rE   rE   rF   r    s
   
z-SubgraphPythonWrapperCodegen.get_graph_inputsr  c                 C  s(   | j  }rt|j }|S tjj}|S r>   )r  r   rJ  r|  r*   r?   r  )r   r  namesrE   rE   rF   r     s
   
z2SubgraphPythonWrapperCodegen.get_graph_input_namesr  c                 C  rI  r>   )r  r)  r*   r?   r  )r   r  r  rE   rE   rF   r  '  s
   
z.SubgraphPythonWrapperCodegen.get_graph_outputsr  r;  c                   s0   |  }| j }r||jv rd S t | d S r>   )r$  r  rJ  r   r  )r   r  rl   r  r   rE   rF   r  .  s   z/SubgraphPythonWrapperCodegen.codegen_allocationc                 C     | j   d S r>   )r  r  r   rE   rE   rF   r  8  s   z5SubgraphPythonWrapperCodegen.write_triton_header_oncec                 C  rL  r>   )r  r  r   rE   rE   rF   r  A  s   z=SubgraphPythonWrapperCodegen.write_get_raw_stream_header_oncer>   )r  rM   r  r   r  r  r  r'  rF  rH  rD  rC  rE  rG  )r   r   r   rH  r   rn  rq  r  r}  r  r  r  r  r  r  r  r  r  r#   r  r  r   rE   rE   r   rF   r  
  s*    





	

	

r  )r:   r;   r<   r=   )rH   r;   rI   r;   )rK   rL   r<   rM   )r`   rL   r<   rM   )rc   rd   r<   rM   r>   )
rl   rM   ru   rv   rw   rx   ry   rz   r<   r{   r'  )
__future__r   r   r   r   r   r/  r  r  r1  rT  rY   r   	itertoolsr   typingr   r   r   r   r   r   r	   r  Z
torch._opsZtorch.utils._pytreeutilsZ_pytreer  r
   r  Ztorch._dynamo.utilsr   r   Z#torch._inductor.codegen.debug_utilsr   Z$torch._inductor.codegen.multi_kernelr   Z%torch._inductor.runtime.runtime_utilsr   Z%torch.fx.experimental.symbolic_shapesr   r   r   r   r   Ztorch.fx.noder   Ztorch.utils._ordered_setr   Z torch.utils._sympy.singleton_intr   Ztorch.utils._sympy.symbolr   r   rY  r   r   r   Z	codecacher   r   r    Zruntimer!   Zruntime.hintsr"   r#   r$   r%   r&   r'   r(   r)   Zvirtualizedr*   r  r,   r-   r.   r/   r0   r1   r2   Z	cpp_utilsr3   Ztriton_utilsr4   r5   r6   collections.abcr7   r8   r   r?   r9   ZdoprintrG  r   rO  rM   r=   r#  r;   rG   rJ   r_   rb   rt   r  r   r  r   r   r   r  r   r   r   r   r  r  r  r  r(  r   r-  r+  r:  rJ  rX  ru  r   r  rE   rE   rE   rF   <module>   s   $	$	




*	
XM	")                a