o
    ZhU                    @   sZ  U d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZmZmZmZmZ d dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZm Z  d d	l!m"Z"m#Z#m$Z$ d
dl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZD ddlEmFZFmGZGmHZHmIZI ddlJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZV ddlWmXZXmYZYmZZZm[Z[m\Z\m]Z]m^Z^m_Z_m`Z`maZambZbmcZcmdZdmeZe ejfdkZgehddd ZiejjkeldZmeg dZnddddddd d!d"d"d#
Zoeg d$Zpd%d&d'd(d)d*d+d,d-d.d/
Zqd0d1d2ZrejsejtgZuejvejwejsejtejxejyejzej{ej|g	Z}e~ej ed3< ejwejsejtejyejzgZe~ej ed4< d5d6 Zd7d8 Z		dd9eej fd:d;Zd<d= Zd>eRd?ejd@edAejdBejdCeLfdDdEZdFeeeNf dGedHedIejdJeeef f
dKdLZdMeRdNedOefdPdQZejhd9ejdRejfdSdTZejhd9ejdRejdUefdVdWZejh	dd9ejdRejdUee fdXdYZejG dZd[ d[ZG d\d] d]e5ZG d^d_ d_ZG d`da daeUZedb G dcdd ddeZede e  G dfdg dgeZG dhdi dieSZG djdk dkeZG dldm dmeZdne/dCeeej exf fdodpZG dqdr drZG dsdt dteZG dudv dveZG dwdx dxeZG dydz dze2ZG d{d| d|ZG d}d~ d~ZejG dd dZejG dd dZdS )    N)Sequence)Enum)AnyCallablecastOptionalUnion)dependencies)is_float_dtypeis_integer_dtype)
OrderedSet)CeilDivFloorDivModularIndexing)free_symbol_is_typesymbol_is_typeSymT   )counters   )	codecacheconfigcpp_buildercpu_vec_isairmetrics)LoopBody)BaseSchedulerNodeBaseSchedulingExternKernelSchedulerNodeForeachKernelSchedulerNodeFusedSchedulerNode	SchedulerSchedulerNode)cache_on_selfget_bounds_index_exprget_fused_kernel_namehas_free_symbolsis_multi_outputs_templateis_welford_reductionparallel_num_threadsPlaceholdersympy_index_symbolsympy_index_symbol_with_prefixsympy_product
sympy_subs)NullKernelHandleropsOpsValueV   )BackendFeatureBracesBufferCSECSEVariableDataTypePropagationDeferredLineDTYPE_TO_COMPUTATION_DTYPEIndentedBufferKernel
KernelArgsOpOverridesOptimizationContext)_get_dtype_from_loopbodies_get_loop_bodycexprcexpr_indexcodegen_randCppCSEVariableDTYPE_TO_CPP
INDEX_TYPELocalBufferContextmay_unify_binary_op_mask_typepromote_args(template_fusion_with_epilogues_supportedunify_mask_base_typevalue_to_cppwin32c                   C   s   t rdS dS )Nz__declspec(dllexport) _IS_WINDOWS rS   rS   J/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/cpp.pyget_export_declarationW      rU   Zschedule)+*^||minmaxrW   rX   rY   r[   r\   argminargmaxrZ   Zwelford)
sumprodxor_sumr[   r\   r]   r^   anywelford_reducewelford_combine)
r\   r[   r_   r`   ra   rc   rd   r]   r^   rb   z
at::Tensorlongdoubleboolzstd::stringzc10::ScalarTypezat::MemoryFormatz
at::Layoutz
at::Devicez
at::Scalar)
ZTensorintfloatrg   strZ
ScalarTypeZMemoryFormatZLayoutZDevicenumberzstd::vectorzstd::optional)Listr   VECTORIZABLE_DTYPESMASKED_VECTORIZABLE_DTYPESc                 C   s   |t v rtj}| dv rdS | dkrdS | dv r`t| }|tjkr)| dv r)ttj }t|r3d| dnd	| d
}t|rCd	| dnd	| d}| dv rO|n|}| dv rW|S d| d| dS t| rldt|  dS t| )N)ra   r_   rb   r   r`   r4   )r\   r^   r[   r]   r]   r^   -std::numeric_limits<>::infinity()std::numeric_limits<>::min()>::max())r\   r^   )r\   r[   IndexValue<z>{0, }Welford<>())	DTYPE_LOWP_FPtorchfloat32rG   rg   ri   r
   r)   AssertionError)reduction_typedtypecdtypeZmin_varZmax_varZinit_varrS   rS   rT   reduction_init   s6   


r   c                 C   sL   t t|  }t| rd| dS | dv r$|tjkrt tj }d| dS |S )Nrw   >ro   ru   )rG   r;   r)   rz   rg   ri   )r}   r~   scalar_typerS   rS   rT   reduction_acc_type   s   

r   indexc           
   	   C   s  |t jk}| dkr|rdnd}| d| d| S | dkr$| d| S | dkr/| d| S | d	kr:| d
| S | dv rI|  d| d| dS | dkrVd| d| dS | dkr|t|tre|\}}}	nt| |\}}}	d| d| d| d|	 d	S | dv rt|dr|jt jkr|js|d ur|  d| d| d| dS |  d| d| dS |d ur|  d| d| d| dS |  d| d| dS t| )Nr_   |rW    r`    * ra    ^ rb    || )r[   r\   z_propagate_nan(, )rc   welford_combine(rd   , {})ro   r~   z	_combine(z, static_cast<float>(), )))	rz   rg   
isinstancetuplereduction_projecthasattrr~   is_vecr|   )
r}   var
next_valuer   	src_dtypeis_boolconjunctionmeanm2weightrS   rS   rT   reduction_combine   sB   

r   c                 C   s:   t | r| d| d| dfS | dv r| dS |S )Nz.meanz.m2z.weightro   z.index)r)   )r}   accrS   rS   rT   r     s
   
r   codeiter_varnew_iter_var
loop_startloop_endreturnc              
   C   s   t  }t g}|dt d| dt| d| dt| d| d  ||  t| j	D ]3\}}t
|ttfs?J d}	t
|trL|j}	|j}td	|  d	 | |}
|	r`t|	|
}
||
 q2W d   |S 1 sqw   Y  |S )
a  
    f(iter_var) is transformed to f(new_iter_var) under the inner loop
      \/
    for (new_iter_var = loop_start; new_iter_var < loop_end; new_iter_var++) {
        f(new_iter_var)
    }
    Please be careful while using this function,
    as the variable defined in f(iter_var) will be invalid outside the for loop.
    For example:
    auto tmp0 = in_ptr[x0]; ->
    for (new_x0 = start; new_x0 < end; new_x0++){
        auto tmp0 = in_ptr[new_x0];
    }
    The tmp0 is invalid outside the loop.
    zfor (r    = ; < ; ++)N\b)r6   
contextlib	ExitStack	writelinerH   rD   enter_contextindent	enumerate_linesr   rj   r:   namelineresub)r   r   r   r   r   Ztransformed_codestack_r   Zdeferred_namenew_linerS   rS   rT   move_code_under_inner_loop	  s8   



r   acc_varacc_typer}   r~   lenc              
   C   sz   t  }t rd|  d| d| dn
| d|  d| d}||  |d| d	d
d|  d||| ddg |S )a  
    MSVC don't support dynamic array(VLA). So we use std::unique_ptr here.
    Ref: https://stackoverflow.com/questions/56555406/creating-dynamic-sized-array-using-msvc-c-compiler
    MSVC is the only one compiler without VLA. support. Since MSVC can't get good performance here.
    We just use unique_ptr make it works on MSVC.
    For other compilers, we continue to use VLA to get best performence.
    auto z_arr = std::make_unique<z[]>();r   _arr[];for (int i = 0; i < ; i++){    z
_arr[i] = r   rv   )r<   r   Z
is_msvc_clr   
writelines)r   r   r}   r~   r   init_fnZcode_bufferZacc_declrS   rS   rT   reduction_prefix_array9  s   
r   bufferr   new_namec                 C   st   t | jD ]2\}}t|ttfsJ t|tr'td|  d | |j|_qtd|  d | || j|< qd S )Nr   )r   r   r   rj   r:   r   r   r   )r   r   r   ir   rS   rS   rT   replace_acc_nameZ  s   
 "r   r   c                 C   s6   |  |s	tjjS ||d i}t| |}t||  S Nr4   )hassympySZeror/   simplify)r   r   replacement	new_indexrS   rS   rT   	stride_ati  s
   

r   
vec_lengthc                    s   d d fdd}fdd}| }t jddd}| tr+| t||} t jd	dd}| trA| t|||} t | } | |krPt| S | S )
a  
    Simplifies the index expression within the range of a vectorized loop.
    Given a vectorized loop variable `var` in the range of a loop with `vec_length`,
    this function transforms the `index` into an equivalent form. It handles
    simplifications for cases where `var` can be expressed as `vec_length * a + b`,
    where `b` ranges from 0 to `vec_length - 1`. The function reduces occurrences
    of `FloorDiv` and `ModularIndexing` in the `index` with best-effort optimizations.

    NOTE:
    The simplified index expression is intended for analysis purposes only, not
    for code generation. It replaces `FloorDiv` and `ModularIndexing` with free variables
    which are not dependent on the loop variable `var` in the vectorized range. Check
    https://github.com/pytorch/pytorch/pull/117221#discussion_r1449746217 for more details.

    Examples:
    1. If `var` is `x3` and `vec_length` is 16, and `x3 = 16*a + b`, then
       `FloorDiv(x3, div)` or `ModularIndexing(x3, div, mod)` becomes a free variable
       when `div` is divisible by 16.
    2. `ModularIndexing(x3, 1, mod)` can be simplified to `x3 + c` where `c` is a free
       variable when `mod` is divisible by 16.
    r   c                    s:   t | }t| krt d  } d7  |S )NZ_div_cr4   )r   r   gcdSymbol)divisorresult)div_freevar_idr   r   rS   rT   visit_indexing_div  s
   
z7simplify_index_in_vec_range.<locals>.visit_indexing_divc                    sx   t | |}t| krt d  } d7  |S | dkr:t|kr:t d   } d7  |S )NZ_mod_cr4   )r   r   r   r   )r   modulusr   )mod_freevar_idr   r   rS   rT   visit_modular_indexing  s   z;simplify_index_in_vec_range.<locals>.visit_modular_indexingr   T)integerr   )r   ZWildr   r   replacer   r   simplify_index_in_vec_range)r   r   r   r   r   Zoriginal_indexdivmodrS   )r   r   r   r   rT   r   u  s   


r   c                 C   s   |rt | ||} t| |S N)r   r   )r   r   r   rS   rS   rT   stride_at_vec_range  s   
r   c                   @   s"   e Zd ZU dZeed< eed< dS )ParallelDepthz{
    A class representing parallel depth.
    Includes the starting depth of parallelism and the depth of parallelism.
    parallel_depthstart_depthN)__name__
__module____qualname____doc__rh   __annotations__rS   rS   rS   rT   r     s   
 r   c                       s`   e Zd ZededefddZdddeeee	f  f fdd	Z
d
d Zdd Zdd Z  ZS )OuterLoopFusedSchedulerNodenode1node2c                 C   s   |j |j u sJ tdd ||fD sJ tdd ||fD rF| |j t|tu r/t| n|gt|tu r@t|  |S |g |S | |j ||g|S )Nc                 s   "    | ]}t |tttfv V  qd S r   )typer   r#   r!   .0noderS   rS   rT   	<genexpr>  s    
z3OuterLoopFusedSchedulerNode.fuse.<locals>.<genexpr>c                 s       | ]	}t |tu V  qd S r   r   r   r   rS   rS   rT   r         )	schedulerallrb   r   r   listget_outer_nodes)clsr   r   outer_loop_fusion_depthrS   rS   rT   fuse  s,   	
z OuterLoopFusedSchedulerNode.fuser   r"   outer_fused_nodesc                    sR   || _ || _g }| j D ]}t|ttfsJ |t|  qt 	|| d S r   )
r  r   r   r#   r!   extendr   	get_nodessuper__init__)selfr   r  r   Zflatten_snodes_node	__class__rS   rT   r    s   
z$OuterLoopFusedSchedulerNode.__init__c                 C      | j S r   )r  r  rS   rS   rT   r        z+OuterLoopFusedSchedulerNode.get_outer_nodesc              
      s   dt dt dtdtdtf
 fdd tt|d D ]}|| j}||d  j} |||d	s1 d
S q|D ]0}ttj	|j
d | }t|j
|krdt|tjrdt|j
| tjrd|d |j
| k rd d
S q4dS )Nleft_loop_nestright_loop_nestloop_fusion_depthcurrent_checking_depthr   c                    s   | j sJ |j s
J | j |  |j | g d}t fdd|D s&dS |dks,J |d  }dkrS|d }|t| j k sAJ |t|j k sJJ | |||sSdS dS )N)r   sizeoffsetstepsc                 3   s$    | ]}t  |t |kV  qd S r   )getattr)r   Zattr_compareZleft_loop_levelZright_loop_levelrS   rT   r     s    
zaOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._inner.<locals>.<genexpr>Fr4   r   T)loopsr   r   )r  r  r  r  Zouter_loops_attr_compare_list_innerr  rT   r  	  s.   



zNOuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attr.<locals>._innerr4   r   F,  T)LoopNestrh   rg   ranger   	loop_nest	functoolsreduceoperatormulrangesr   r   Integer)r  cpp_kernel_proxy_listr   idxr  r  cpp_kernel_proxyouter_rangesrS   r  rT   "check_outer_fusion_loop_level_attr   sR   	*


z>OuterLoopFusedSchedulerNode.check_outer_fusion_loop_level_attrc                    sP   |d j }t|} fdd|D |_|d }||j_|jjd  j |j_|S )Nr   c                    s   g | ]	}|j  jqS rS   )r  from_loop_levelr   )r   proxyr  rS   rT   
<listcomp>\  s    zJOuterLoopFusedSchedulerNode.merge_outer_fusion_kernels.<locals>.<listcomp>)kernel_groupOuterLoopFusedKernelinnerr  kernelr  r   )r  r$  r,  Zouter_loop_fused_kernelZouter_fused_proxyrS   r  rT   merge_outer_fusion_kernelsV  s   

z6OuterLoopFusedSchedulerNode.merge_outer_fusion_kernels)r   r   r   classmethodr   r  r   r   r!   r#   r  r   r(  r0  __classcell__rS   rS   r	  rT   r     s    #Vr   c                   @   s<   e Zd ZddefddZdd Zdd Zd	d
 Zdd ZdS )RecordOptimizationContextrP   	func_namec                 C   s   || _ d | _d | _d S r   )r4  current_nodeopt_ctx)r  r4  rS   rS   rT   r  i  s   
z"RecordOptimizationContext.__init__c                 C   sr   t jsJ t jjsJ t jj| _| jd usJ tj| jjv r'| jjtj | _nt | _| jd us2J | j| j_| S r   )	r3   interpreterr5  r@   keymetar6  r4  Zops_namer  rS   rS   rT   	__enter__n  s   


z#RecordOptimizationContext.__enter__c                 C   s(   | j sJ | js
J | j| j jtj< d S r   )r5  r6  r9  r@   r8  r  exc_typeexc_valexc_tbrS   rS   rT   __exit__|  s   

z"RecordOptimizationContext.__exit__c                 C   r  r   )r6  r  rS   rS   rT   get_opt_ctx  r  z%RecordOptimizationContext.get_opt_ctxc                 C      | j sJ | j S r   )r5  r  rS   rS   rT   get_fx_node     
z%RecordOptimizationContext.get_fx_nodeN)rP   )	r   r   r   rj   r  r:  r?  r@  rB  rS   rS   rS   rT   r3  h  s    r3  c                   @   s  e Zd ZdZedd Zedd Zedd Zedd
dZedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Zed:d; Z ed<d= Z!ed>d? Z"ed@dA Z#edBdC Z$edDdE Z%edFdG Z&edHdI Z'edJdK Z(edLdM Z)edNdO Z*edPdQ Z+edRdS Z,edTdU Z-edVdW Z.edXdY Z/edZd[ Z0ed\d] Z1ed^d_ Z2ed`da Z3edbdc Z4eddde Z5edfdg Z6edhdi Z7edjdk Z8edldm Z9edndo Z:edpdq Z;edrds Z<edtdu Z=edvdw Z>edxdy Z?edzd{ Z@ed|d} ZAed~d ZBedd ZCedd ZDedd ZEedd ZFedeGjHdeGjHfddZIedeGjHdeGjHfddZJedeGjHdeGjHfddZKedd ZLedd ZMdS )CppOverrideszMap element-wise ops to C++c                 C      d|  d|  d| dS )N	decltype()( + r   rS   abrS   rS   rT   add     zCppOverrides.addc                 C   rE  )NrF  rG   - r   rS   rI  rS   rS   rT   r     rM  zCppOverrides.subc                 C   rE  )NrF  rG  r   r   rS   rI  rS   rS   rT   r!    rM  zCppOverrides.mulNTc                 C   s   t | tsJ |d u r| j}tj| ||}tjjtjj|}|	d| |fd|i |t
v r>|tjkr>	 tj| ||| |S )Nto_dtyper   )r   rF   r~   r3   r/  get_to_dtype_exprcsegeneratecomputeupdate_on_argsry   rz   ri   cache_dtype_convert)xr~   r   Zuse_compute_typesexprcsevarrS   rS   rT   rO    s   zCppOverrides.to_dtypec                 C   s2   |t v sJ | dt ddt |  d|  dS )Nz missing from z.DTYPE_TO_CPPzc10::bit_cast<>(r   )rG   r   )rV  r~   r   rS   rS   rT   to_dtype_bitcast  s   zCppOverrides.to_dtype_bitcastc                 C      d|  dS )Nz	std::abs(r   rS   rV  rS   rS   rT   abs  rV   zCppOverrides.absc                 C   r[  )Nz	std::sin(r   rS   r\  rS   rS   rT   sin  rV   zCppOverrides.sinc                 C   r[  )Nz	std::cos(r   rS   r\  rS   rS   rT   cos  rV   zCppOverrides.cosc                 C      d|  d|  dS )NrF  z)(-r   rS   r\  rS   rS   rT   neg     zCppOverrides.negc                 C   r[  )Nz	std::exp(r   rS   r\  rS   rS   rT   exp  s   zCppOverrides.expc                 C   r[  )Nz
std::exp2(r   rS   r\  rS   rS   rT   exp2  rV   zCppOverrides.exp2c                 C   r[  )Nzstd::expm1(r   rS   r\  rS   rS   rT   expm1  rV   zCppOverrides.expm1c                 C   r[  )Nz	std::erf(r   rS   r\  rS   rS   rT   erf  rV   zCppOverrides.erfc                 C   r[  )Nz
std::erfc(r   rS   r\  rS   rS   rT   erfc  rV   zCppOverrides.erfcc                 C   r[  )Nzcalc_erfinv(r   rS   r\  rS   rS   rT   erfinv  rV   zCppOverrides.erfinvc                 C   r[  )Nz
std::sqrt(r   rS   r\  rS   rS   rT   sqrt  rV   zCppOverrides.sqrtc                 C   r[  )Nz1 / std::sqrt(r   rS   r\  rS   rS   rT   rsqrt  rV   zCppOverrides.rsqrtc                 C   sB   t jj}|dkr|  d|  dS |d u rd|  dS td|)Naccuracy + decltype()(1)zstd::log1p(r   8unrecognized config cpp.inject_log1p_bug_TESTING_ONLY = r   cppZinject_log1p_bug_TESTING_ONLYr|   rV  bugrS   rS   rT   log1p  s   zCppOverrides.log1pc                 C   r[  )Nz	std::tan(r   rS   r\  rS   rS   rT   tan  rV   zCppOverrides.tanc                 C   r[  )Nz
std::tanh(r   rS   r\  rS   rS   rT   tanh	  rV   zCppOverrides.tanhc                 C   s   t rd|  dS d|  dS )z
        On windows std::signbit only support float type.
        Ref: https://learn.microsoft.com/en-us/cpp/c-runtime-library/reference/signbit?view=msvc-170
        z std::signbit(static_cast<float>(r   zstd::signbit(r   rQ   r\  rS   rS   rT   signbit  s
   
zCppOverrides.signbitc                 C      d|  d| dS )Nz	std::pow(r   r   rS   rI  rS   rS   rT   pow  rb  zCppOverrides.powc                 C   r[  )Nz	std::log(r   rS   r\  rS   rS   rT   log  rV   zCppOverrides.logc                 C   r[  )Nzstd::nearbyint(r   rS   r\  rS   rS   rT   round!  rV   zCppOverrides.roundc                 C   r[  )Nzstd::floor(r   rS   r\  rS   rS   rT   floor%  rV   zCppOverrides.floorc                 C   sF   |  d| }|  d| }d|  d| d| d| d| d| d	S )
N /  % ((z
 < 0) != (z	 < 0) ? (z != 0 ? z - 1 : z) : r   rS   )rJ  rK  quotremrS   rS   rT   floordiv)  s   *zCppOverrides.floordivc                 C   r[  )Nz
std::ceil(r   rS   r\  rS   rS   rT   ceil0  rV   zCppOverrides.ceilc                 C   r[  )Nzstd::trunc(r   rS   r\  rS   rS   rT   trunc4  rV   zCppOverrides.truncc                 C      |  d| S Nr|  rS   rI  rS   rS   rT   truncdiv8  s   zCppOverrides.truncdivc                 C   rw  )Nz
std::fmod(r   r   rS   rI  rS   rS   rT   fmod=  rb  zCppOverrides.fmodc                 C   r[  )Nzstd::isinf(r   rS   r\  rS   rS   rT   isinfA  rV   zCppOverrides.isinfc                 C   r[  )Nzstd::isnan(r   rS   r\  rS   rS   rT   isnanE  rV   zCppOverrides.isnanc                 C   r[  )Nzstd::lgamma(r   rS   r\  rS   rS   rT   lgammaI  rV   zCppOverrides.lgammac                 C   r[  )Nz
std::acos(r   rS   r\  rS   rS   rT   acosM  rV   zCppOverrides.acosc                 C   r[  )Nzstd::acosh(r   rS   r\  rS   rS   rT   acoshQ  rV   zCppOverrides.acoshc                 C   r[  )Nz
std::cosh(r   rS   r\  rS   rS   rT   coshU  rV   zCppOverrides.coshc                 C   r[  )Nz
std::sinh(r   rS   r\  rS   rS   rT   sinhY  rV   zCppOverrides.sinhc                 C   r[  )Nz
std::asin(r   rS   r\  rS   rS   rT   asin]  rV   zCppOverrides.asinc                 C   r[  )Nzstd::asinh(r   rS   r\  rS   rS   rT   asinha  rV   zCppOverrides.asinhc                 C   rw  )Nzstd::atan2(r   r   rS   rV  yrS   rS   rT   atan2e  rb  zCppOverrides.atan2c                 C   r[  )Nz
std::atan(r   rS   r\  rS   rS   rT   atani  rV   zCppOverrides.atanc                 C   r[  )Nzstd::atanh(r   rS   r\  rS   rS   rT   atanhm  rV   zCppOverrides.atanhc                 C   rw  )Nzstd::copysign(r   r   rS   r  rS   rS   rT   copysignq  rb  zCppOverrides.copysignc              	   C   s   d|  dd|  df}t dd |D rtdd |D S t }tjjjtjd}tjjj| j	d}|
d| d	 |
d
| d|  d| d tjj| ||f}t||D ]\}}tjj|| q[||fS )Nfrexp()[0])[1]c                 s   "    | ]}t jj|d uV  qd S r   r3   r/  rQ  Ztry_getr   	cache_keyrS   rS   rT   r   x       z%CppOverrides.frexp.<locals>.<genexpr>c                 s       | ]
}t jj|V  qd S r   r  r  rS   rS   rT   r   y      r~   zint32_t r   r   z = std::frexp(, &r   )r   r   r6   r3   r/  rQ  newvarrz   int32r~   r   rS  splicezipput)rV  
cache_keysr   exponentmantissacse_varsr  cse_varrS   rS   rT   frexpu  s   zCppOverrides.frexpc                 C   rw  )Nzstd::hypot(r   r   rS   r  rS   rS   rT   hypot  rb  zCppOverrides.hypotc                 C   r[  )Nzstd::log10(r   rS   r\  rS   rS   rT   log10  rV   zCppOverrides.log10c                 C   r[  )Nz
std::log2(r   rS   r\  rS   rS   rT   log2  rV   zCppOverrides.log2c                 C   rw  )Nzstd::nextafter(r   r   rS   r  rS   rS   rT   	nextafter  rb  zCppOverrides.nextafterc                 C   f   t jj}|dkr
dS |dkr|  dS |dkr|  d|  dS |d u r,d|  d	|  d
S td|)Ncompile_errorcompile error!runtime_error	; throw 1rk  rl  rm  z	std::max(, decltype()(0))7unrecognized config cpp.inject_relu_bug_TESTING_ONLY = r   rp  Zinject_relu_bug_TESTING_ONLYr|   rq  rS   rS   rT   relu     
zCppOverrides.reluc                 C   rw  )Nzmin_propagate_nan(r   r   rS   rI  rS   rS   rT   minimum  rb  zCppOverrides.minimumc                 C   rw  )Nzmax_propagate_nan(r   r   rS   rI  rS   rS   rT   maximum  rb  zCppOverrides.maximumc                 C   s   |  d| d| S )N ?  : rS   )rJ  rK  crS   rS   rT   where  s   zCppOverrides.wherec                 C   rw  )Nzmod(r   r   rS   rI  rS   rS   rT   r     rb  zCppOverrides.modc                 C   s   t | t| S r   )rN   rG   )valr~   rS   rS   rT   constant     zCppOverrides.constantc                 C   s8   t tj| }tjjjtjj|t| d}t	||S )NZbounds)
rC   r3   r/  rename_indexingrQ  rR  rS  r%   r1   rO  )rW  r~   Zidx_strr   rS   rS   rT   
index_expr  s
   zCppOverrides.index_exprc              	   C   s   t  }tjj }|d| d tj|( |  | }|d| d W d    n1 s4w   Y  W d    n1 sCw   Y  |d tjj	| t
|d| d}|  d| d| S )	Nr    = [&]return r   rF  z())r  z() : )r6   r3   r/  rQ  r  r   swap_buffersr   rS  r  rN   )maskbodyotherr   Zbody_varr   
other_coderS   rS   rT   masked  s    
zCppOverrides.maskedc                 C   r  )N && rS   rI  rS   rS   rT   logical_and  r  zCppOverrides.logical_andc                 C   
   d|  S )N!rS   rJ  rS   rS   rT   logical_not     
zCppOverrides.logical_notc                 C   r  )Nr   rS   rI  rS   rS   rT   
logical_or  r  zCppOverrides.logical_orc                 C   r  )N != rS   rI  rS   rS   rT   logical_xor  r  zCppOverrides.logical_xorc                 C   rE  )NrF  rG   & r   rS   rI  rS   rS   rT   bitwise_and  rM  zCppOverrides.bitwise_andc                 C   r`  )NrF  z)(~r   rS   r  rS   rS   rT   bitwise_not  rb  zCppOverrides.bitwise_notc                 C   rE  )NrF  rG   | r   rS   rI  rS   rS   rT   
bitwise_or  rM  zCppOverrides.bitwise_orc                 C   rE  )NrF  rG  r   r   rS   rI  rS   rS   rT   bitwise_xor  rM  zCppOverrides.bitwise_xorc                 C   s   t  }|d | W t| j }|d| d| d |d| d| d| d |  |d	|  d
 W d    n1 sEw   Y  |d	|  d| d|  d| d	 W d    n1 sfw   Y  |d |S )N[&]()constexpr decltype() max_shift = sizeof(z) * CHAR_BIT;$if ((static_cast<std::make_signed_t<>>() < 0) || ( >= max_shift))return decltype(z)(0);z#)(static_cast<std::make_unsigned_t<z) << r   ()r6   r   r   rG   r~   rJ  rK  r   Zscalar_trS   rS   rT   bitwise_left_shift  s&   




zCppOverrides.bitwise_left_shiftc              
   C   s   t  }|d | Z t| j }|d| d| d| d |d| d| d| d	 |  |d
|  d|  d W d    n1 sKw   Y  |d
|  d|  d| d W d    n1 siw   Y  |d |S )Nr  r  r  z ) * CHAR_BIT - std::is_signed_v<z>;r  r  r  r  r  rG  z >> max_shift); >> r   r  r  r  rS   rS   rT   bitwise_right_shift  s"   



 
z CppOverrides.bitwise_right_shiftseedr  c                 C   rw  )Nznormalized_rand_cpu(r   r   rS   r  r  rS   rS   rT   rand  rb  zCppOverrides.randc                 C   rw  )Nz
randn_cpu(r   r   rS   r  rS   rS   rT   randn  rb  zCppOverrides.randnc              	   C   s   d|  d| d| d| d	S )Nzrandint64_cpu(r   r   rS   )r  r  lowhighrS   rS   rT   	randint64!  s   zCppOverrides.randint64c                 C      d|  d|  d|  dS )NrF  z)(1) / (decltype(z)(1) + std::exp(-r   rS   r\  rS   rS   rT   sigmoid%  rM  zCppOverrides.sigmoidc              
   C   s   t  }d|  d}d|  d}|d | + |d|  d| d| d |d	|  d
| d| d |d W d    n1 sFw   Y  |d |S )NrF  )(0)rm  r  auto left = z > 0 ? r  r   auto right = z < 0 ? return left - right;r  r6   r   r   )rV  r   Zscalar_zeroZ
scalar_onerS   rS   rT   sign)  s   


zCppOverrides.signNT)Nr   r   r   r   staticmethodrL  r   r!  rO  rZ  r]  r^  r_  ra  rc  rd  re  rf  rg  rh  ri  rj  rs  rt  ru  rv  rx  ry  rz  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r  r  r  r  r  r  r  r  r  r  r  r  r  r   Exprr  r  r  r  r  rS   rS   rS   rT   rD    s    


*






























































rD  rp  c                       s  e Zd ZdZ fddZedd Zedd Zedd	 Zed
d Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zed d! Zed"d# Zed$d% Zed&d' Zed(d) Zed*d+ Zed,d- Zed.d/ Zed0d1 Zed2d3 Zed4d5 Zed6d7 Zed8d9 Z ed:d; Z!ed<d= Z"ed>d? Z#ed@dA Z$edBdC Z%edDdE Z&edFdG Z'edHdI Z(edJdK Z)edLdM Z*edNdO Z+edPdQ Z,edRdS Z-edTdU Z.edVdW Z/edXdY Z0edZd[ Z1ed\d] Z2ed^d_ Z3ed`da Z4edbdc Z5eddde Z6edfdg Z7edhdi Z8edjdk Z9edldm Z:edndo Z;edpdq Z<edrds Z=edtdu Z>edvdw Z?edxdy Z@edzd{ ZAed|d} ZBed~d ZCedd ZDedd ZEedd ZFedd ZGedd ZHedd ZIedd ZJedd ZKedd ZLedd ZMedddZNedd ZOedd ZPedd ZQedd ZReSdd ZTeSdd ZU  ZVS )CppVecOverridesz.Map element-wise ops to aten vectorization C++c                    s^   t  |   fdd}tt D ]\}}t|dd tkr,|dvr,t |||j q S )Nc                    s    fdd}|S )Nc                     s@  dd | D }dd | D }t | }|rL|rLg }| D ]1}t|ttjfrFt|tjr5|js5t|tj	}nt
|tj	}t|trD|jn|}|| q|rjt|dkrYt|}ntjkrjt|dd  |dd < |r|rttjtsvJ fdd|D }|r|i |S tt}t|j}|d usJ || i |S )Nc                 S   s0   g | ]}t |ttjfst |tr|js|qS rS   )r   rh   r   r   rF   r   r   argrS   rS   rT   r+  N  s    zJCppVecOverrides.__new__.<locals>.wrap.<locals>.wrapper.<locals>.<listcomp>c                 S   s    g | ]}t |tr|jr|qS rS   )r   rF   r   r  rS   rS   rT   r+  T  s    r   r4   c                    s@   g | ]}t |tr|js tjtjtjfvrtj	|n|qS rS   )
r   rF   r   r  r  r  r  r3   r/  	broadcast)r   Znew_argfuncrS   rT   r+  t  s    )r   r   rh   r   r   	is_numberr1   r  rz   int64r  r2   valueappendr   rK   r  r  r3   r/  CppVecKernelr  r  r   )argskwargsZscalarsZvectorsnew_argsr  Z
scalar_opsscalar_func)r
  r  r  rS   rT   wrapperM  s@   



z6CppVecOverrides.__new__.<locals>.wrap.<locals>.wrapperrS   )r  r  )r
  r  r  rT   wrap@  s   Bz%CppVecOverrides.__new__.<locals>.wrapr
  )r  r  )	r  __new__varsr  itemsr  r  setattr__func__)r   r  Zkargsr  r   methodr	  r  rT   r  =  s   QzCppVecOverrides.__new__c                 C   r  )NrH  rS   rI  rS   rS   rT   rL    r  zCppVecOverrides.addc                 C   r  )NrN  rS   rI  rS   rS   rT   r     r  zCppVecOverrides.subc                 C   r  Nr   rS   rI  rS   rS   rT   r!    r  zCppVecOverrides.mulc                 C   r  r  rS   rI  rS   rS   rT   truediv  r  zCppVecOverrides.truedivc                 C   
   |  dS )Nz.abs()rS   r\  rS   rS   rT   r]    r  zCppVecOverrides.absc                 C   r  )Nz.sin()rS   r\  rS   rS   rT   r^    r  zCppVecOverrides.sinc                 C   r  )Nz.cos()rS   r\  rS   rS   rT   r_    r  zCppVecOverrides.cosc                 C   r  )Nz.exp()rS   r\  rS   rS   rT   rc    r  zCppVecOverrides.expc                 C   r  )Nz.exp2()rS   r\  rS   rS   rT   rd    r  zCppVecOverrides.exp2c                 C   s   d|  d}|  d| S )NrF  rm  z	.exp() - rS   )rV  vec_onerS   rS   rT   re    s   zCppVecOverrides.expm1c                 C   r  )Nz.erf()rS   r\  rS   rS   rT   rf    r  zCppVecOverrides.erfc                 C   r  )Nz.erfc()rS   r\  rS   rS   rT   rg    r  zCppVecOverrides.erfcc                 C   r  )Nz	.erfinv()rS   r\  rS   rS   rT   rh    r  zCppVecOverrides.erfinvc                 C   r  )Nz.sqrt()rS   r\  rS   rS   rT   ri    r  zCppVecOverrides.sqrtc                 C   L   t tjtsJ t | tsJ | jd usJ tj| j d|  d| dS )N( == r   r   r3   r/  r  rF   r~   _get_mask_typer  rS   rS   rT   eq      zCppVecOverrides.eqc                 C   s   t tjtsJ t | tsJ | jtjkr/|jtjksJ ttjj	| |f\}}| d| S | jd us6J tj
| j d|  d| dS )Nr  r  r   )r   r3   r/  r  rF   r~   rz   rg   rM   rS  r   )rV  r  Zx_castZy_castrS   rS   rT   ne  s    zCppVecOverrides.nec                 C   r  )Nr  r   r   r  r  rS   rS   rT   lt  r"  zCppVecOverrides.ltc                 C   r  )Nr  z > r   r  r  rS   rS   rT   gt  r"  zCppVecOverrides.gtc                 C   r  )Nr   <= r   r  r  rS   rS   rT   le  r"  zCppVecOverrides.lec                 C   r  )Nr   >= r   r  r  rS   rS   rT   ge  r"  zCppVecOverrides.gec                 C   r  Nr  rS   r  rS   rS   rT   and_  r  zCppVecOverrides.and_c                 C   r  )Nz.rsqrt()rS   r\  rS   rS   rT   rj    r  zCppVecOverrides.rsqrtc                 C      |  d| dS )Nz.pow(r   rS   rI  rS   rS   rT   rx       zCppVecOverrides.powc                 C   r  )Nz.log()rS   r\  rS   rS   rT   ry    r  zCppVecOverrides.logc                 C   r  )Nz.round()rS   r\  rS   rS   rT   rz    r  zCppVecOverrides.roundc                 C   r  )Nz.floor()rS   r\  rS   rS   rT   r{    r  zCppVecOverrides.floorc                 C   r  )Nz.ceil()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.ceilc                 C   r  )Nz.trunc()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.truncc                 C   r,  )Nz.fmod(r   rS   rI  rS   rS   rT   r  #  r-  zCppVecOverrides.fmodc                 C   r  )Nz	.lgamma()rS   r\  rS   rS   rT   r  '  r  zCppVecOverrides.lgammac                 C      t | |\} }|  d| S r*  rJ   rI  rS   rS   rT   r  +     zCppVecOverrides.logical_andc                 C   r  N~rS   r  rS   rS   rT   r  0  r  zCppVecOverrides.logical_notc                 C   r.  Nr  r/  rI  rS   rS   rT   r  4  r0  zCppVecOverrides.logical_orc                 C   r.  Nr   r/  rI  rS   rS   rT   r  9  r0  zCppVecOverrides.logical_xorc                 C   r.  r*  r/  rI  rS   rS   rT   r  >  r0  zCppVecOverrides.bitwise_andc                 C   r  r1  rS   r  rS   rS   rT   r  C  r  zCppVecOverrides.bitwise_notc                 C   r.  r3  r/  rI  rS   rS   rT   r  G  r0  zCppVecOverrides.bitwise_orc                 C   r.  r4  r/  rI  rS   rS   rT   r  L  r0  zCppVecOverrides.bitwise_xorc                 C   r  )Nz << rS   rI  rS   rS   rT   r  Q  r  z"CppVecOverrides.bitwise_left_shiftc                 C   r  )Nr  rS   rI  rS   rS   rT   r  U  r  z#CppVecOverrides.bitwise_right_shiftc                 C   s    t tjtsJ tj| | S r   )r   r3   r/  r  load)r   r  rS   rS   rT   	load_seedY  s   zCppVecOverrides.load_seedc                 C   .   t tjtsJ t }d|  d}t|||S )Nz)result[offset_idx] = normalized_rand_cpu(, offset[offset_idx]);r   r3   r/  r  r6   rE   r  r  r   rand_functionrS   rS   rT   r  ^  s
   
zCppVecOverrides.randc                 C   r7  )Nzresult[offset_idx] = randn_cpu(r8  r9  r:  rS   rS   rT   r  g  s   zCppVecOverrides.randnc                 C   s>   t tjtsJ t }d|  d| d| d}t|||tjS )Nz#result[offset_idx] = randint64_cpu(z, offset[offset_idx], r   r   )r   r3   r/  r  r6   rE   rz   r  )r  r  r  r  r   r;  rS   rS   rT   r  n  s   zCppVecOverrides.randint64c                 C   s0   | j |j ks
J d|  dt| | d| S )Nz;remainder vec implementation expect the same inputs' dtype.z - (z) * )r~   r  r  rI  rS   rS   rT   	remainderu  s   zCppVecOverrides.remainderc                 C   r  )Nz.tan()rS   r  rS   rS   rT   rt  |  r  zCppVecOverrides.tanc                 C   r  )Nz.tanh()rS   r  rS   rS   rT   ru    r  zCppVecOverrides.tanhc                 C   r  )Nz.reciprocal()rS   r  rS   rS   rT   
reciprocal  r  zCppVecOverrides.reciprocalc                 C   r  )Nz.atan()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.atanc                 C   r  )Nz.acos()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.acosc                 C   r  )Nz.asin()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.asinc                 C   r  )Nz.cosh()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.coshc                 C   r  )Nz.sinh()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.sinhc                 C   r  )Nz.log10()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.log10c                 C   r  )Nz.log2()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.log2c                 C   r,  )Nz.nextafter(r   rS   r  rS   rS   rT   r    r-  zCppVecOverrides.nextafterc                 C   r,  )Nz
.copysign(r   rS   rI  rS   rS   rT   r    r-  zCppVecOverrides.copysignc                 C   r,  )Nz.atan2(r   rS   rI  rS   rS   rT   r    r-  zCppVecOverrides.atan2c                 C   r,  )Nz.hypot(r   rS   rI  rS   rS   rT   r    r-  zCppVecOverrides.hypotc              
   C   s:   d|  d}d|  d}| d| d|  d| d|  d
S )	NrF  rm  z)(0.5)z * ((rH  z)/(rN  z)).log()rS   )rV  r  Zvec_one_halfrS   rS   rT   r    s   "zCppVecOverrides.atanhc                 C   r  )Nz.asinh()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.asinhc                 C   r  )Nz.acosh()rS   r\  rS   rS   rT   r    r  zCppVecOverrides.acoshc                 C   r  )Nr  r  r  r  rk  rl  rm  zat::vec::clamp_min(r  r  r  r  rq  rS   rS   rT   r    r  zCppVecOverrides.reluc                 C   r  )NrF  z)(1)/(decltype(z)(1) + z.neg().exp())rS   r\  rS   rS   rT   r    rM  zCppVecOverrides.sigmoidc                 C   r  )Nz.neg()rS   r\  rS   rS   rT   ra    r  zCppVecOverrides.negc                 C   s   t | jr| j|jksJ dd|  d| dS tdd | |fD s%J d|  d}tj|jdk rH| d	dtjj> d  d
| d| d}|  d| }d|  d| d| d}d|  d| d| d| d	}| d| d| d| d| d| dS )NzDdiv_floor_floating_vec implementation expect the same inputs' dtype.zdiv_floor_floating_vec(r   r   c                 s   s    | ]}t |jV  qd S r   )r   r~   )r   itemrS   rS   rT   r         z+CppVecOverrides.floordiv.<locals>.<genexpr>rF  r4   ::blend<rY  (1), r|  r  r}  r  z(0))r~  r   z	(0)) != (z(0)))z	::blendv(rN  r  )r
   r~   r   r3   r/  _get_raw_num_vectorstiling_factor)rJ  rK  _tr  Zhas_remZis_negrS   rS   rT   r    s   
((zCppVecOverrides.floordivc                 C   sT   t j|jdk r#d| d}| ddt jj> d  d| d| d}|  d| S )Nr4   rF  r   r@  rY  rA  r|  )r3   r/  rB  r~   rC  )rJ  rK  rD  rS   rS   rT   r    s   (zCppVecOverrides.truncdivc                 C   R   | j tjkr |j tjksJ ttjj| |f\}}| d| S d|  d| dS )Nr  at::vec::minimum(r   r   r~   rz   rg   rM   r3   r/  rS  rJ  rK  Za_castZb_castrS   rS   rT   r    
   zCppVecOverrides.minimumc                 C   rE  )Nr  at::vec::maximum(r   r   rG  rH  rS   rS   rT   r    rI  zCppVecOverrides.maximumc                 C   s   |  d|  S r  rS   r  rS   rS   rT   square
  r  zCppVecOverrides.squarec                 C   s   t tjtsJ |jtjkr2|jtjksJ ttjj| ||f\}}}d| d| d| d| d	S d| d| d| dtj	| |j d	S )NrF  
)::blendv(r   r   )
r   r3   r/  r  r~   rz   rg   rM   rS  _get_mask_cast)rJ  rK  r  Zblendv_aZblendv_bZblendv_crS   rS   rT   r    s   
*zCppVecOverrides.wherec                 C   s   t  }d|  d}d|  d}d|  d| d| d| d|  d}d|  d| d| d|  d| d}|d |  |d	| d
 |d| d
 |d W d    n1 s^w   Y  |d |S )NrF  r  rm  rL  r   r   r   r  r  r   r  r  r  r  )rV  r   Zvec_zeror  Zblendv_lZblendv_rrS   rS   rT   r    s   $$


zCppVecOverrides.signNTc              
   C   s   |t jt jt jt jt jt jt jt jt j	f	v sJ t
 d| t| ts&J | j}tj| ||}tjjtjj|}|d| |fd|i |tv rX|t jkrXtj| ||| |S )Nz does not support rO  r   )rz   rg   float64ri   bfloat16float16uint8int8r  r  r   r   rF   r~   r3   r/  rP  rQ  rR  rS  rT  ry   rU  )rV  r~   r   Zuse_compute_dtypesrW  rX  rS   rS   rT   rO  )  s*   
zCppVecOverrides.to_dtypec                 C   s@   t jj}|dkr|  d|  dS |d u r|  dS td|)Nrk  rl  rm  z.log1p()rn  ro  rq  rS   rS   rT   rs  ?  s   
zCppVecOverrides.log1pc                    s8  t tjtsJ t }tjj }tj| G}|d| d tj	|( |
  | }|d| d W d    n1 sCw   Y  W d    n1 sRw   Y  W d    n1 saw   Y  |d tjj| |j | d} fdd}|jr|}	n||}	t|t  }
||
}t |tsJ ||jrft }|d tj	| |
  |d	| d
 |
  |d| d W d    n1 sw   Y  |d |
 H tjjtjj|	}tjjtjj|}t |tsJ |t |tsJ | |_ |_tjj}|d|||| d W d    n	1 s1w   Y  W d    n	1 sAw   Y  W d    n	1 sQw   Y  |d tjjtjj|}n)|jr}tjjtjj|  d|	 d| }ntjjtjj|  d| d|
 }|d| |||fi  |S )Nr   r  r  r   r  c                    s8    t jkrtj  d|  dS tj  d|  dS )N::from(r   r  )rz   rg   r3   r/  r   _get_vec_type)r   r  rS   rT   maskify_or_vecify[  s
   
z1CppVecOverrides.masked.<locals>.maskify_or_vecify[&]if (z.all_zero())elser  r  r  )r   r3   r/  r  r6   rQ  r  r  r   r  r   rS  r  r~   r   rN   rG   rF   rR  	overridesr  rT  )r  r  r  r   r   Znew_maskr   Z	body_coderU  Zbody_code_vecr  Zother_code_vecZbody_vec_varZother_vec_varrY  rX  rS   r  rT   r  K  s    





 
zCppVecOverrides.maskedc                 C   s   t tjtsJ tj| }tjjtjj }tj||}|dkr't	| |S |d urQtjj
jtjjt|t| d}t||}t |trI|j}tj||}ntjd ||tjj}|d| |fi  |S )Nr   r  r  )r   r3   r/  r  r  itervars
tiling_idx_try_get_const_striderD  r  rQ  rR  rS  rC   r%   r1   rO  r2   r	  arange_load_or_store_non_contiguousrT  )rW  r~   r   
tiling_varstrider%  r	  rX  rS   rS   rT   r    s&   
zCppVecOverrides.index_exprc              	   C   s  d|  dd|  df}t dd |D rtdd |D S t| j }tjjr+tjjntjj}t }tjj	j
tjd}tjj	j
| jd}|jd| fi d	 |jd| fi d	 tj| j}|d
krgd| dnd| d| d}||d
kr|d| dnd| d| d || d| d |d |  |d| dtjj d ||  dt| d |dtjj d |d| dtjj d |dt| d |  |d W d    n1 sw   Y  ||d
kr| dt| dn| d | d!t| d || d"| d#t| d W d    n	1 s-w   Y  |d$ tjj| ||f}	t||	D ]\}
}tjj	|
| qG||fS )%Nr  r  r  c                 s   r  r   r  r  rS   rS   rT   r     r  z(CppVecOverrides.frexp.<locals>.<genexpr>c                 s   r  r   r  r  rS   rS   rT   r     r  r  r  )r  r4   at::vec::Vectorized<r   at::vec::VectorizedN<r   zat::vec::Vectorized<int32_t> r   zat::vec::VectorizedN<int32_t, > r   r  __at_align__ std::array<	> tmpbuf;.store(tmpbuf.data(), r   z!__at_align__ std::array<int32_t, z> tmpbuf_exponent;z> tmpbuf_mantissa;r   r   z@tmpbuf_mantissa[i] = std::frexp(tmpbuf[i], &tmpbuf_exponent[i]);z? = at::vec::Vectorized<int32_t>::loadu(tmpbuf_exponent.data(), z! = at::vec::VectorizedN<int32_t, z!>::loadu(tmpbuf_exponent.data(), r   z ::loadu(tmpbuf_mantissa.data(), z();)r   r   rG   r~   r3   r/  	tail_sizerC  r6   rQ  r  rz   r  rT  _get_num_vectorsr   r   rD   rS  r  r  r  )rV  r  r   r  r   r  r  n_vecZ
mantissa_tr  r  r  rS   rS   rT   r    sl   





zCppVecOverrides.frexpc                    s    fdd}|S )Nc                     s>  |rJ t j}t|tsJ t }|d | d j}||}|jr&|jn|j	}g }t
| } jdv }	|	r8dn|}
 jdkrEt
| d  n|
}
|  t| D ]D\}}t|tr|js^J |j|kseJ |d| d|j	 d	| d
 || d| dt| d |d| d qP|| qP|d|
 d|j	 d  | }|dt| d |  |d| d
 W d    n1 sw   Y  |	r|jrJ d}d| d| d}ndt| }|dkrd|
 d}n	d|
 d| d}|d| d| d W d    n	1 sw   Y  |d |S ) Nr  r   )r  r  rv  rg   rZ  rd  r   z> tmpbufr   z.store(tmpbufz	.data(), r   Ztmpbufz[i]z> tmpbuf_out;r   r   ztmpbuf_out[i] = ztmpbuf_out.data()at::vec::VecMask<,z>::fromztmpbuf_out.data(), r4   ra  z>::loaduz at::vec::VectorizedN<r  r  r  )r3   r/  r   r  r6   r   r~   rh  rg  rC  rG   r   r   r   rF   r   rD   r
  )r  r  r/  r   	vec_dtyperi  r  Zscalar_argsr   Zoutput_maskZoctypeZargidxr  resZ	load_argsZload_fnr  rS   rT   r.    sb   










 z)CppVecOverrides._scalarize.<locals>.innerrS   )r   r  r.  rS   ro  rT   
_scalarize  s   9zCppVecOverrides._scalarizec                 C   sV   t t}t t D ]\}}t|tr(||vr(| |j}||_t	| |t| q
d S r   )
r  r  rD  r  r   r  rp  r  r   r  )r   Zvec_varsr   r  r  rS   rS   rT   _initialize_scalarize%  s   z%CppVecOverrides._initialize_scalarizer  )Wr   r   r   r   r  r  rL  r   r!  r  r]  r^  r_  rc  rd  re  rf  rg  rh  ri  r!  r#  r$  r%  r'  r)  r+  rj  rx  ry  rz  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r6  r  r  r  r<  rt  ru  r=  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  ra  r  r  r  r  rK  r  r  rO  rs  r  r  r  r1  rp  rq  r2  rS   rS   r	  rT   r  :  sB   ]









































































L

8
<r  Zcppvecc                   @   s   e Zd Zedd ZdS )CppTile2DOverridesc                 C   s(   t tjtsJ tj| } t| |S r   )r   r3   r/  CppTile2DKerneltransform_indexingr  r  )rW  r~   rS   rS   rT   r  4  s   zCppTile2DOverrides.index_exprN)r   r   r   r  r  rS   rS   rS   rT   rr  3  s    rr  c                       s  e Zd ZeZeZdZdZ fddZ	e
efddZdd ZdNd
ee fddZejdd Z	dOdejfddZdejdefddZdejdejfddZdejdejfddZdd Zdejd ejd!ed"efd#d$Zd%edejfd&d'ZdNd(d)Z d*e!e"ef d+ed,ed-e#j$fd.d/Z%dNd ee& fd0d1Z'd2d3 Z(d4d5 Z)d6d7 Z*d8d9 Z+d:d; Z,d<d= Z-e.defd>d?Z/d@dA Z0ejdBdC Z1dDdE Z2dFdG Z3dHdI Z4				dPd
edJee dKeej fdLdMZ5  Z6S )Q	CppKernelr   r   c                    s   t  | i | _g | _d | _g | _g | _d | _t | _	g | _
t | _t | _t | _t | _t | _d| _t | _t| j| jdd| _t| j| jdd| _t | _t | _|| _i | _g | _d S )NFZtmp_acc)Zname_prefixZwrecps)r  r  active_rangesinner_itervarscall_rangesr"  rZ  reduction_depthr<   reduction_prefixreduction_prefix_generatorsreduction_suffixparallel_reduction_prefixparallel_reduction_suffixlocal_reduction_initlocal_reduction_storesis_reductionnon_parallel_reduction_prefixr7   newvar_prefixsuffixreduction_cseweight_recps_csepreloads
poststoresnum_threadsZreduction_omp_decreduction_var_names)r  r  r  r	  rS   rT   r  A  s2   

zCppKernel.__init__c           
      C   s   t jjr| js| jd | d}t jjrdnt }| d}	| j| d| d||| d | jt|||||| | j	|	 d| d | j
d| d	d
d| d||||	|d ddg d S )Nz(int max_threads = omp_get_max_threads();_localmax_threadsz	_arr[tid]r   r   r   zfor (int tid = 0; tid < z; tid++)r   r   r   rv   )r   rp  dynamic_threadsr}  r   r*   r  r  r   r  r~  r   )
r  r   r   r}   r~   reduction_combine_fnreduction_init_fnZ	acc_localr  Zacc_local_in_arrayrS   rS   rT   _gen_parallel_reduction_buffersf  s:   	



z)CppKernel._gen_parallel_reduction_buffersc                 C   s$   | j D ]}t| j|| d qd S )Nr  )r  r   stores)r  var_namerS   rS   rT   %update_stores_with_parallel_reduction  s   
z/CppKernel.update_stores_with_parallel_reductionNr   c                 C   s   |d u sJ t  }t 1}t| dr%|| j | | ||  || j	 || j
 || j W d    n1 sAw   Y  t| drQ|| j | jrl| jD ]}| j| \}}t||| d||}qW|S )Ncodegen_inner_loops_tail)r6   r   r   r   r  r  r  r   r   loadsrS  r  r  rw  rv  r   )r  r   r   r%  startendrS   rS   rT   gen_body  s$   




zCppKernel.gen_bodyc                 c   s`    | j }|rt||}t|tr|j}t|tsJ tj|_	|| _ z	|V  W || _ dS || _ w )z>Context manager to add an additional mask to loads and stores.N)

_load_maskr1   r+  r   r2   r	  rF   rz   rg   r~   )r  r  priorrS   rS   rT   r    s   
zCppKernel.maskedr4   r   r   c                 C   s(   | j | }||| | i}t||}|S r   )rZ  r/   )r  r   scaleitervar_idxr  r   r   r   rS   rS   rT   scale_index_with_offset  s   

z!CppKernel.scale_index_with_offsetr   c                 C   s   t | |S )z
        Convert an index expr to a string that can be used in cpp code.
        e.g. a sympy expression "s2" may actually appear as "ks1" in the cpp kernel.
        )rC   r  r  r   rS   rS   rT   index_to_str  s   zCppKernel.index_to_stritervarc                    s   t  fdd|jD S )z]
        Check if an index has free symbol CppCSEVariable that depends on `itervar`.
        c                 3   sF    | ]}|j jjv rtjj|j  trjj|j   V  qd S r   )r   rQ  varname_mapr   rF   Z
depends_onr   sr  r  rS   rT   r     s    z6CppKernel.index_indirect_depends_on.<locals>.<genexpr>)rb   free_symbolsr  r   r  rS   r  rT   index_indirect_depends_on  s   z#CppKernel.index_indirect_depends_onc                 C   s   ||j v p
| ||S r   )r  r  r  rS   rS   rT   index_depends_on  s   zCppKernel.index_depends_onc                 C   s   t t| j| jS r   )dictr  rZ  r"  r  rS   rS   rT   
var_ranges     zCppKernel.var_rangesrW  r  lowerupperc                 C   s   |s|sd S t |tj}|rt|tjj}tj	j
}n tj	j
}z| jtj	_
t|tjj}W |tj	_
n|tj	_
w | j}|rFtj	| |nd }	| ||rOdnd |	| j}
| jj||
dd d S )N0F)Z
assignment)r   r   TMPr1   r  rz   r  r	  r3   r/  rS  r  sexprr  indirect_assertr  rQ  rR  )r  rW  r  r  r  ZindirectrX  r   Zprior_computesize_strr   rS   rS   rT   check_bounds  s"   

zCppKernel.check_boundsr   c                 C   sR   | j |}| |}| dt| d}| j| j|}|d| ||fi  |S )N[]r5  )r  inputr  rD   rQ  rR  r  rT  )r  r   r   r   r   rX  rS   rS   rT   r5    s   
zCppKernel.loadc                 C   s   d|v sJ | j |}| |}|d u r#| dt| d| d}nB|dkr^tjjs>| jdkr>| dt| d| d}n'tj	
|}dt|  d	| d
}d| dt| d| d}ntd| | jt|| d S )Nbufr  ] = r   
atomic_addr4   z] += zstatic_cast<rY  r   zatomic_add(&z], r   store mode=)r  outputr  rD   r   rp  r  r  r3   graph	get_dtyperG   NotImplementedErrorr  r   r:   )r  r   r   r	  moder   r   r~   rS   rS   rT   store  s   
zCppKernel.storer   r   rtyper~   c                    s$   ddt t f fdd}|S )Nr  c                    s6   | d u r d  d dS t  | S )Nr   r   r   )r   )r  r   r   r~   r   r  rS   rT   r.  $  s   z.CppKernel._gen_reduction_prefix.<locals>.innerr   )r   rh   )r  r   r   r  r~   r   r.  rS   r  rT   _gen_reduction_prefix  s    zCppKernel._gen_reduction_prefixc                 C   s    | j D ]
}| j|| qd S r   )r{  rz  r  )r  r  Zgen_fnrS   rS   rT   finalize_reduction_prefix3  s   
z#CppKernel.finalize_reduction_prefixc              	   C   s"  |dv }|||f}|| j jv r| j j| S | j j| jd| dd}| j|  d| _|r0|n|}t||}	| j| 	||	||t
 | jd usKJ | j| j }
t| jd t| jD ]}|
| j|  | j|  }
q\| j| dt||||
 d | ||	|| t||}|| j j|< |S )	Nr^   r]   
reduction FwriteTr4   r   r   )r  reduction_cacherR  r  r  r
  r  r   r{  r  r   ry  rZ  r  r   r"  r  r   r   r  r   )r  r~   r   r}   r	  argmax_or_argminreduction_keyr   
init_dtyper   r   r   r   rS   rS   rT   	reduction7  s6   



zCppKernel.reductionc              
   C   sB   |  |}| j|}| jt|| dt| d| d d S )Nr  r  r   )r  r  r  r|  r   r:   rD   )r  r   r   r	  r   rS   rS   rT   store_reductionV  s
   
zCppKernel.store_reductionc                    s    j r) j t|t| ksJ  j  dt| dt|  jt|ks(J n&t|t|  _  fdd j D  _dd tt jD  _t| _ jd  j  j jd  fS )Nr  rH  c                       g | ]}  |qS rS   r  )r   rV  r  rS   rT   r+  e      z(CppKernel.set_ranges.<locals>.<listcomp>c                 S      g | ]}t tj|qS rS   r-   r   ZXBLOCKr   nrS   rS   rT   r+  f      
)rx  r   ry  r   r"  r  rZ  )r  lengthsZreduction_lengthsrS   r  rT   
set_ranges]  s   
zCppKernel.set_rangesc                 C   s&   | j d usJ tjjjt| j ddS )N    fallback)rx  r3   r  sizevars	size_hintr.   r  rS   rS   rT   r  p  s   
zCppKernel.size_hintc                    s6  t tsJ t 	jd usJ t |jtr!|j| 	n| 	|jd uo4|jj	 j
t }jrOrD
  n
	 | n	dkr^
 r^|   dtf fddddd	ddtd	tf 	
fd
dddtd	tf fdd		ddtd	tdtffdd|   t |jtrt tjtrtjjrtjj}| D ]C}tfdd| jD }t| j  }d| dt!| d}	|" }
 #d| d|
 d|	 d  #| d|
 d|
 d q| W d    d S 1 sw   Y  d S ) Nr4   
_loop_nestc                    s    fdd}   }t|tr|jD ]   qd S t|ts#J  jd ur/| r/|  t }|	
  | W d    d S 1 sKw   Y  d S )Nc                     s"    j sJ  j j } | jo| jS r   )r  r   r  parallel)root)r  	par_depthrS   rT   is_parallel_reduction  s   
zOCppKernel.codegen_loops_impl.<locals>.gen_kernel.<locals>.is_parallel_reduction)
get_kernelr   r-  r.  CppKernelProxyr  r  r   r   r   r   r  )r  r  r/  r   )r   gen_loop_nestr  )r  rT   
gen_kernel  s   



"z0CppKernel.codegen_loops_impl.<locals>.gen_kernelFc                 S   sB   |r| j }|r| j| }|S | j}|r|| j }|S || j }|S r   )r|  r~  rz  r}  r  )r/  r  	is_suffixr  prefixrS   rS   rT   get_reduction_prefix_suffix  s   


zACppKernel.codegen_loops_impl.<locals>.get_reduction_prefix_suffixr   depthc                    s  |   }| js	J | j| }t s}|jr.|s.||jdd}|r)|    | rF|jrF |j	rF|j
s@J  |j	 | | r]|jr]|j
rY |j
   |jru|s} ||jdd W d    d S W d    d S W d    d S 1 sw   Y  d S )NF)r  T)r  r  r   r   r  r  r   r   r  r  r  close)r  r  in_reductionr/  loopZstack_outerrz  )r   gen_loop_atr  is_reduction_loopthreadsworksharingrS   rT   gen_loop_with_reduction  s@   










"z=CppKernel.codegen_loops_impl.<locals>.gen_loop_with_reductionc                    s   t  9}| js
J | j| }| }|d u r 	 W d    d S  | |   | |d |j W d    d S 1 s@w   Y  d S r   )r   r   r  linesr   r   r   r  )r  r  r   r  Z
loop_lines)r   r  rS   rT   r    s   



"z1CppKernel.codegen_loops_impl.<locals>.gen_loop_atr  c                    s4   | j d u s|t| j kr |  d S | || d S r   )r  r   )r  r  r  )r  r  rS   rT   r    s   z3CppKernel.codegen_loops_impl.<locals>.gen_loop_nestc                    r  rS   r  )r   Zsize_valr  rS   rT   r+    s    z0CppKernel.codegen_loops_impl.<locals>.<listcomp>zstd::make_unique<z []>(r   zstd::unique_ptr<z	 []> buf_r   r   z* z = buf_z.get();)FF)r   F)r   )$r   r  r*   rx  r/  r-  decide_parallel_depthmax_parallel_depthr  r   r  r   r   r   r  r  mark_parallelsingler   r   r  rh   rg   r3   local_buffer_contextrI   local_buffersvaluesr.   
get_layoutr  rG   r~   rC   get_namer  )r  r  r   r  r   r  local_bufferZlocal_buf_sizeZlocal_buf_dtypeallocateZlocal_buffer_namerS   )r   r  r  r  r  r  r  r  r  r  r  rT   codegen_loops_implv  s   




!




$zCppKernel.codegen_loops_implc                 C   s   t | }| ||| d S r   )r  buildr  )r  r   r  r  rS   rS   rT   codegen_loops	  s   
zCppKernel.codegen_loopsc                 C   s   t jjrdS dS )NZAOTI_TORCH_CHECKZTORCH_CHECK)r3   r  Zaot_moder  rS   rS   rT   assert_function		  s   zCppKernel.assert_functionc           	      C   s   | j d usJ | j |j|j|j  }|  }d}d}|D ]-}tjjj|dd}|d| ks2||kr4 n|| tjj	k r> n|d7 }||9 }|| }qtjj
r[|dkr[t|dkr[d}t||jdS )Nr4   r   r  r  r   r   r   )rx  r   r   r  r3   r  r  r   rp  Zmin_chunk_sizer  r   r   )	r  r  r  r"  seqparr  rW  hintrS   rS   rT   r  	  s.   

zCppKernel.decide_parallel_depthc                 c   s    | j | j| j| jf}t | _ t | _t | _| j | _d V  | j| j  | j| j | j| j |\| _ | _| _| _d S r   )r  rS  r  rQ  r<   cloner|  r  )r  r  rS   rS   rT   write_to_suffix-	  s   zCppKernel.write_to_suffixc                 O   s   t |i |S r   )rF   )r  r  r  rS   rS   rT   create_cse_var:	     zCppKernel.create_cse_varc                 C   s   dt |  d| dS )Nzc10::convert<rY  r   )rG   )r  srcr~   r   rS   rS   rT   rP  =	     zCppKernel.get_to_dtype_exprc                 C   s    |  |||}| j|| d S r   )rP  rQ  r  )r  dstZ	dst_dtyper  r   rW  rS   rS   rT   rU  @	  s   zCppKernel.cache_dtype_convertr  r   c           
         s   |d u rd}j sdS g   fdd}|d ur/|j v sJ j | \}}||||s.dS nj  D ]\}}|\}}||||sE dS q4d }	|	r[|d| d|	 d	 dS dS )
NrP   Tc                    s   | |krdS d }t jD ]\}}||kr|} nqttkr/|r/| dkr/|j| kr/d} | dt|    | dt|  dS )NFr   r4   r(  r   T)r   rZ  r   ru  r"  r
  rD   )r  r  r   Zvar_idr   _varZ
conditionsr  rS   rT   genP	  s"   z)CppKernel.codegen_conditions.<locals>.genFr  zif(r  r   )rv  r  joinr   )
r  r   r  r   r  r  r  r  _rangeZjoined_conditionsrS   r  rT   codegen_conditionsD	  s,   
zCppKernel.codegen_conditionsr   )r4   r  r   NN)7r   r   r   rD  rY  rC   r  r  r  r  r   r   r  r  r   r6   r  r   contextmanagerr  r   r   r  rj   r  r   r  r  r  rg   r  r5  r  r   r8   rz   r~   r  rh   r  r  r  r  r  r  r  propertyr  r  r  r  rP  rU  r  r2  rS   rS   r	  rT   ru  ;  s    +
)


 


 
ru  c                       s  e Zd ZeZ	d? fdd	ZdejdejfddZ	de
jd	efd
dZde
jd	efddZde
jd	efddZe
jfde
jd	efddZdede
jd	efddZ	d?dedejde
jdee fddZ			d@dee dejde
jdee deeeef  ded	ee fddZdedejf fd d!Z	dAd"eeef dedejde
jdef
d#d$Zd?d%d&Zd'd( Zd)d* Zd+ed	efd,d-Z ded.ejd	efd/d0Z!d1d2 Z"d3d4 Z#d?d5d6Z$ddde
j%fdeej d7ee d8ee
j fd9d:Z&d? fd;d<	Z' fd=d>Z(  Z)S )Br  Nc                    s\   t  || t | _| jsJ |dksJ d|| _|| _|| _|r)|| _d S || _d S )Nr   z0Expect pass in Non-Zero tiling_factor explicitly)	r  r  r   pick_vec_isavec_isarC  r[  rg  	num_elems)r  r  r  rC  r[  rg  r	  rS   rT   r  x	  s   

zCppVecKernel.__init__r   r  c                    s`     ||rd S  fdd|jD D ]}t|tsJ |jr! d S qt|| j}|jr.|S d S )Nc                 3   *    | ]}t |tjr jj|j V  qd S r   r   r   r  rQ  r  r   r  r  rS   rT   r   	      

z5CppVecKernel._try_get_const_stride.<locals>.<genexpr>)r  r  r   rF   r   r   rC  r  )r  r   r  indirect_varr`  rS   r  rT   r\  	  s   

z"CppVecKernel._try_get_const_strider~   r   c                 C   s0   t | j|j d | j  }|dksJ |S )N   r4   )mathr  rC  itemsizer  	bit_widthr  r~   num_vectorsrS   rS   rT   rh  	  s
   zCppVecKernel._get_num_vectorsc                 C   s   | j |j d | j  S )Nr!  )rC  r#  r  r$  )r  r~   rS   rS   rT   rB  	  s   z!CppVecKernel._get_raw_num_vectorsc                 C   s8   |  |}|dkrdt|  dS dt|  d| dS )Nr4   ra  r   rb  rl  )rh  rG   r%  rS   rS   rT   rT  	  s   
zCppVecKernel._get_vec_typec                 C   s.   |t jkrdS | |}dt|  d| dS )NrP   rk  rl  r   )rz   rg   rh  rG   r%  rS   rS   rT   r   	  s   

zCppVecKernel._get_mask_typer  c                 C   s<   |j tjksJ t|| |}| dt|  d| dS )Nz.template cast<rl  rx   )r~   rz   rg   reprrh  rG   )r  r  r~   r&  rS   rS   rT   rM  	  s   
zCppVecKernel._get_mask_castr   	load_maskc           
      C   s   t | }| |}d}|r%|js| tj d| d}n| |tj }|dkr2| dt| n|}|tjkrE|   d| d}	|	S |rU| d| d| d| dn| 	| d	| d
t| j
 d}	|	S )a  
        Get a load line str that loads a vector from `var` at `index` of type `dtype`.
        If `load_mask` is not None, we do a masked load accordingly.
        Notes on the `dtype`:
        1. We always load `self.tiling_factor` number of elements regardless of the `dtype`.
           It means we load half of the vector lanes for 16-bit data types and quarter of the
           vector lanes for 8-bit data types.
        2. `torch.bool` and `torch.uint8` could mean masks and we load them as float mask vectors.
        NrS  r   r   rH  z.template loadu<rl  rY  ::loadu(r   )rG   rh  r   r   rz   ri   rM  rD   rg   rT  r  )
r  r   r   r~   r(  Zcpp_typer&  Zload_mask_strloadbufr   rS   rS   rT   _get_vec_load_line	  s    

 zCppVecKernel._get_vec_load_lineFr   store_value
accu_storec                    s  |r
|dus
J d|r|sJ  du rj  dtjdtffdddtjdtffddd	tdtf fd
d}t }|d | c |}	|}
dt|  d|
 d}|| |rr|| dt	|	 d t
jj  d}i }fdd|jD D ]}t|tsJ |jr||}| d| d||< qj|j|d}d}jdur|rJ dtjtsJ jjjrӈj d| d}nj d}t r|dj  n	|dj  |d| d| d t	j d! | d"  | } t h}t	|}|D ]}td#|  d# || |}q|dur8| d| dn| }|rN|d$| d ||  |rg|rVd%nd&}|| d'| d(| d) n|d*| d+| d, W d   n	1 s~w   Y  W d   n	1 sw   Y  |sd-d.|}|d/| d, W d   n	1 sw   Y  |d0 |r|d,  | dS j |}t|tsJ d1|_|S )2a  
        Load or store a vector in a non-contiguous way. The vector is initialized from an array that is
        filled in an inner loop over the tiling factor.
        :param var: buffer to load from or store to, i.e. `var[transformed(index)]`. If None, we load the index
                    as index expression, i.e. `transformed(index)`.
        :param index: index into the `var` or the index expression by its own if `var` is None.
                      The `index` could contain indirect indexing or the tiling itervar. When used in
                      the inner loop, the index is transformed as follows:
                      1. the index is linearized along the tiling dim.
                      2. the indirect indexing vector variables are transformed into arrays over the tiling dim.
        :param dtype: data type of `var` or `index` if `var` is None.
        :param buffer: the code buffer to write the generated code to. If None, we write to `self.loads`.
        :param store_value: the value to store. If None, we load the vector.
        :param accu_store: whether accumulate the store_value to store_ptr. If True, a store_value should be provided
        :return: a CppCSEVariable that represents the loaded vector or None if it is a store.
        Nzstore var must be providedr~   r   c                        | j dk r jd| j   S  jS N   )r#  r  r  r  rS   rT   get_result_size	     
zCCppVecKernel._load_or_store_non_contiguous.<locals>.get_result_sizec                    r.  r/  )r#  rC  r  r  rS   rT   get_tiling_size	  r2  zCCppVecKernel._load_or_store_non_contiguous.<locals>.get_tiling_sizevec_varc                    s   | j sJ t }|d | C | j}|d usJ |tjkr#tj}|}|}|dt|  d| d |  dt	| d}|| |d W d    n1 sWw   Y  |d j
 |}t|tsoJ |S )	NrV  rd  r   re  rf  r   zreturn tmpbuf;r  )r   r6   r   r   r~   rz   rg   ri   rG   rD   rQ  rR  r   rF   )r4  r   rm  result_sizetiling_sizer   rX  r   r1  r3  r  rS   rT   vec_to_array
  s*   





z@CppVecKernel._load_or_store_non_contiguous.<locals>.vec_to_arrayrV  rd  r   re  rf  r   r  c                 3   r  r   r  r  r  rS   rT   r   ,
  r  z=CppVecKernel._load_or_store_non_contiguous.<locals>.<genexpr>r  r  r  r  zunexpected store with load maskz.is_masked(r   z != 0z#pragma GCC unroll z#pragma unroll 
for (long  = 0; r   r   r   r   rW  +==r   z tmpbuf[r   ztmpbuf[r  r   ztmpbuf.data()r   r  r  T)r  rz   r~   rh   rF   r6   r   r   rG   rD   r,   rZ  r[  r  r   r   r  r  r   is_gccrC  r  r   r   r   r   r   r+  r  rQ  rR  )r  r   r   r~   r   r,  r-  r8  r   r5  r6  Zresult_declareZitervar_innerZreplacementsr   Z	array_varr(  r   Zindex_crhsr   Z	load_linerX  rS   r7  rT   r^  	  s   





  
@

z*CppVecKernel._load_or_store_non_contiguousr   c           	         s   | j |}| |}tj|}| j| j }| ||}|dkr(t	 
||S |dkr>| |||| j}| j| j|}n| |||}t|tsLJ |d| ||fi  d|_|S )Nr   r4   r5  T)r  r  r  r3   r  r  rZ  r[  r\  r  r5  r+  r  rQ  rR  r  r^  r   rF   rT  r   )	r  r   r   r   r~   r_  r`  r   rX  r	  rS   rT   r5  h
  s   
zCppVecKernel.loadr	  c                 C   s*  t |tst |tr|jsJ || j| j }| dt| }| ||}t }	|dkr|r^|t	j
krD| jdu rD| | d| dn| | d| dt| j d}
d| d|
 d}|t	j
kru| jdu ru|	| d| d	 |	S |	| d| dt| j d	 |	S | j||||	||d
 |	S )a2  
        Get a store line buffer that stores `value` into `var` at `index` of `dtype`. It handles
        both contiguous and non-contiguous store cases.
        :param value: Vectorized type templaterized on `dtype`.
        :param var: buffer to store into.
        :index: index into the `var`.
        rH  r4   Nr)  r   r   r  .store(r   )r   r,  r-  )r   rj   rF   r   rZ  r[  rD   r\  r<   rz   ri   rg  rT  r  r   r^  )r  r	  r   r   r~   r-  r_  Zvar_exprr`  r   r5  rS   rS   rT   _get_store_line|
  s:   
 	zCppVecKernel._get_store_linec                    sd  d v sJ t |tsJ ||js| |}| j }| |}tj	 }|d u rC| 
||||}| j| fdd d S |dkrtjjsj| jdkrj| j
| |||dd}| j| fdd d S | |}| tj}	t| }
t|tjj}t |tr|jsJ d	|
 d
|	 d
| d| d
| d
| d}| jt | d S td| )Nr  c                    
   t  | S r   r:   r\  r   rS   rT   <lambda>
     
 z$CppVecKernel.store.<locals>.<lambda>r  r4   T)r-  c                    rB  r   rC  r\  rD  rS   rT   rE  
  rF  zatomic_add_vec<r   rY  r   r  )r   rF   r   r  r  r  r  r3   r  r  rA  r  r  mapr   rp  r  r  rh  rz   r  rG   r1   r  r	  r   r:   r  )r  r   r   r	  r  r   r~   r   n_srcn_idxr   r   rS   rD  rT   r  
  s8   


*zCppVecKernel.storec               
   C   s  |t v sJ |dv }| j| jk}|r|n|}t|tsJ ||js'| |}|||f}|| jjv r8| jj| S d}	|	 dt	|  d}
t
||}| ||}| jj| jd| dd}t|tscJ | d}d	| }|  j| ||g7  _d
| _| j| ||||t | j| ||||| j tdd | j| jd  }|dkr=| jd usJ | j| ||||| j tdd | j| jd  }| j| jkr| jnd}t||| _| j| jjvr| jj| jd| j dd| _| j| jj| j< | j|  | t!j"j#rdnt$ }| j%|  || n| jj| j | _| j&r(|n|}| j'| d| (|||d
 d nD| jd usEJ | j)| j }t*| jd t+| j)D ]}|| j|  | j)|  }qV||||d}| j'| d| j(||fi | d | j,||||| j(| jd | j,||||t-td |dkr| j,||||| j(| jd |t.j/k}|r]t0|r| 1|dv sJ dd| d}d| d}| j2| dt-||| d ng|r| d| d}n[|r	|dv rd| d}nL|dksJ | d}n?d| (|d d! d" }|t.j/k}|rt.j3n|}d#t	|  d}
d$t	|  d%| 1| d}| d&|
 d'|
 d(| d%| d
}| j2| dt-||||d) d |}n|}t0|ryd	| }| j2| dt-||| d t4||}|| jj|< |S )*Nr  zat::vecz::Vectorized<r   r  Fr  Z_vecZmasked_Tc                 S      | | S r   rS   r  rS   rS   rT   rE  
      z(CppVecKernel.reduction.<locals>.<lambda>rc   c                 S   rJ  r   rS   r  rS   rS   rT   rE    rK  r4   r  r   r   )r   r   horizontal_reductionr   )r  r  )r4   r   z4Welford reduction does not support VectorizedN (N>2)zwelford_vec_reduce_all(r   z_vec_reduce_all()rb   r_   r\   r  z.all_zero()r[   z.all_masked()z	{ return rV  r  z; }ra  zat::vec::vec_reduce_all<r   z([](z& x, z& y) r  )5VECTORIZABLE_RTYPESr[  ry  r   rF   r   r  r  r  rG   r   reduction_acc_type_vecrR  r  r  r  r{  r
  r  r   reduction_init_vecr  r  r"  rC  r   weight_recp_vec_ranger  rS  weight_recps_valr  r   welford_weight_reciprocal_vecr   rp  r  r*   r  rg  r  reduction_combine_vecrZ  r  r   r  r   rz   rg   r)   rh  r|  ri   r   ) r  r~   r   r}   r	  r  rL  r  r  Zvec_nsZvecr   Zacc_type_vecr   Zacc_vecZmasked_acc_vecZreduction_sizeZreduction_factorr  Zacc_vec_r   r   r  r   r   Zmasked_next_valueZreduce_all_bodyrm  Zvec_reduce_all_funcZtmpvarZmasked_tmpvarr   rS   rS   rT   r  
  s<  





	
	


	


"


zCppVecKernel.reductionc                    s  |  |}| j }tj }|jr|tjkr|ntj	ntj
}tj|}tj|}t }	| j| jkrL|	| dt| dt|  d| d nf||krt|  d| }
|tjkrk| d| tj d}n.||  krudkrn nd	t|  d| d
}nd	t|  d| dt|  d| d| d
}|	d|
 d| d |
}|	| |||| | j|	 fdd d S )Nr  z] = static_cast<rY  r   r   z.template cast<bool,rx   r4   at::vec::convert<r   rl  r   r   r   c                    rB  r   rC  r\  rD  rS   rT   rE    rF  z.CppVecKernel.store_reduction.<locals>.<lambda>)r  r  r  r3   r  r  Zis_floating_pointrz   rf   ri   r  r/  rh  r<   r[  ry  r   rD   rG   rg   r  rA  r|  rG  )r  r   r   r	  r   Z	out_dtyper~   Zout_num_vectorssrc_num_vectorsr   Zconverted_valueconvertrS   rD  rT   r    sF   
"
zCppVecKernel.store_reduction
scalar_varc                 C   s   |j rJ |jtjkr| j| j|   d|j d}n|jd us$J | j| j| 	|j d|j d}t
|ts>J |j|_|j|_d|_ |S )NrS  r   r  T)r   r~   rz   rg   rQ  rR  rS  r   r   rT  r   rF   Zdependent_itervars)r  rW  r4  rS   rS   rT   r    s   
zCppVecKernel.broadcastr`  c              	   C   sb   |j rJ |jd usJ | j| j| |j d| d| d}t|ts(J |j|_d|_ |S )Nz	::arange(r   r   T)r   r~   rQ  rR  rS  rT  r   rF   )r  r   r`  rX  rS   rS   rT   r]    s   
zCppVecKernel.arangec           
      C   s   t | }| |}t|rd| dS |dv rNt| }| ||}|dkr6t|r/d| dnd| d}nt|r@d| dnd| d	}| d
| dS |dkrY|   dS t||}| d
| d}	|tj	kr{|dv sqJ |   d| dS |	S )Nrw   rx   ro   r]   rr   rq   rt   rp   rs   r  r   rb   z	::from(0))r[   r\   r_   rS  )
r;   rT  r)   rG   rN  r
   r   r   rz   rg   )
r  r}   r~   r   vec_typer   r   r  Zscalar_initZvec_initrS   rS   rT   rO    s2   




zCppVecKernel.reduction_init_vecc                 C   s   t | }| |}t|rd| dS |dv rD| |}| tj}|tjkr6dttj  d| d| dS dt|  d| d| dS |tjkrT|dv sOJ | 	  S |S )Nrw   r   ro   zIndexValueVec<r   )r[   r\   rb   r_   )
r;   rT  r)   rh  rz   r  rg   rG   ri   r   )r  r}   r~   r   rX  rH  rI  rS   rS   rT   rN    s   




z#CppVecKernel.reduction_acc_type_vecc                 C   s>   |rt | j|n| j}t|}d| | d| j d| dS )Nzstatic WeightRecp<rc  r  r   )r   rP  rD   rT  rQ  )r  r~   r  Zvec_num_range_threadZvec_num_range_thread_exprrS   rS   rT   rR    s   z*CppVecKernel.welford_weight_reciprocal_vecrL  r   c                 C   s  |t jk}|dkr-| jrd| d| dt| j dS |r$| d| S d| d| dS |dkrU| jrCd| d| dt| j dS |rL| d	| S d
| d| dS |dkr{| jrkd| d| dt| j dS |rodnd}	| d|	 d| S |dkr| jrd| d| dt| j dS | d| S |dkr| jrd| d| dt| j dS | d| S |dkr|r| jrd| d| dt| j d| j d	S d| d| d| j dS | jrd| d| dt| j dS d| d| dS |dkr:t|tr
|\}
}}nt||\}
}}| jr+d| d|
 d| d| dt| j dS d| d|
 d| d| d	S |dv r|d usFJ t| }|t jkrUtt j	 }| 
|}| 
t j}d}d}|d ur~|d uspJ dt|  }d| }| jr| d| d| d| | d | d| | dt| j dS | d| d| d| | d | d| | dS |d!krt|tr|jt jksJ ttjj|f\}| d| S t)"Nr\   zmax_masked_reduce(r   r   r  rJ  r[   zmin_masked_reduce(r  rF  r_   zsum_masked_reduce(r   rW   r   r`   zprod_masked_reduce(r   ra   zxor_sum_masked_reduce(r   rc   r   r  rd   r   z}, r   ro   rP   z_combine_vec<rY  rb   )rz   rg   rg  rD   rQ  r   r   r   rG   ri   rh  r  rj   r  rF   r~   rM   r3   r/  rS  r  )r  r}   r   r   Zuse_weight_recpsr   rL  r   r   r   r   r   r   r   rH  rI  Zt_extraZ	arg_extrarS   rS   rT   rS  
  s   

&
*




0
z"CppVecKernel.reduction_combine_vecc           	   	      s  t |tsJ |jd usJ |js(t |tr|jrd| d}t ||||S |}|}|r:| |j d| d}|rH| |j d| d}|rf|rfd| d| d| d| d	}| d| d| }n#|rw| d| }| d| }n|s{J | d| }| d| }| |j d| d}|r|js| |j d| d}d| d| d}| jr| |j d| |j d	| d
t	| j d}d| d}| j
 d| d| dS )Nr  z).all_masked()r   r&  z) & (r   z) | ~(z::set(z::from(1), (r   z, "index out of bounds: z"))r   rF   r~   r   r  r  rT  r   rg  rD   r  )	r  r   r  r  r  Zlower_scalarZupper_scalarZcondZ
cond_printr	  rS   rT   r  i  sF   zCppVecKernel.indirect_assertc           	         s  t |tsJ |jst |||S t| }| |}t| }| |}d| d}|tjkrG|tjkrG| 	| d| d| d| d}|S |tjkr^|tjkr^| d| d| d}|S ||kr||  krldkryn nd	| d| d}|S d	| d| d| d| d| d}|S )
Nr  r   z::from<rl  rY  z.to<rx   r4   rT  )
r   rF   r   r  rP  rG   rh  rz   rg   r   )	r  r  r~   r   Zsrc_cpp_typerU  Zdst_cpp_typeZdst_num_vectorsrW  r	  rS   rT   rP    s(   

"$zCppVecKernel.get_to_dtype_exprr   )NNF)F)*r   r   r   r  rY  r  r   r   r   r\  rz   r~   rh   rh  ri   rB  rj   rT  r   rF   rM  r   r+  r<   r   rg   r^  r5  rA  r  r  r  r  r]  rO  rN  rR  r{   rS  r  rP  r2  rS   rS   r	  rT   r  u	  s    

*
 


,! >&"

_%r  c                       s   e Zd ZdZeZ		d fdd	Zdd Zdd Z	dd	d
Z	de
dejf fddZd fdd	Zdd Z fddZdejdejfddZ  ZS )rs  an  
    A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
    the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
    tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
    tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
    logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
    and store are generated into kernel.preloads and kernel.poststores buffers.

    The loop structure looks like below:
    for ...
      for i_outer ...
        for ...
          for inner_most ...
            // generated by CppTile2DKernel
            float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
            float tmp1[16*16]; // into kernel.preloads
            for i_inner ... { // the kernel inner loop
              vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
            }
            at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
          for inner_most ... (tail)
            // generated by CppVecKernel
            ...
      for i_outer ... (tail)
        for ...
          for ...
            // generated by CppKernel
            ...
    Nc                    sP   t  ||||d | || _|| _|| _|r|n|| _|r |n|| _d| _d S )Nr4   T)r  r  tiling_indicesinner_tail_sizeouter_tail_sizeinner_num_elemsouter_num_elemsinner_is_tiling_idx)r  r  r  rC  rY  rZ  r[  r	  rS   rT   r    s   	
zCppTile2DKernel.__init__c                 C   s   t | j| j  dS )Nr  )r,   rZ  	outer_idxr  rS   rS   rT   inner_itervar  r  zCppTile2DKernel.inner_itervarc                 C   sh   | j | j }| j | j }t||| j}t||| j}| jd u o3|dko3||o3|| o3|| S r   )rZ  r_  r[  r   rC  r  r   )r  r   Z	outer_varZ	inner_varZouter_strideZinner_striderS   rS   rT   need_vec_transpose  s   


z"CppTile2DKernel.need_vec_transposec                 C   s  t j|}| j}| dt| }d}	tt|| j| j | j }
t| j }|r4|	|}}	||
}
}d}| j	|A rC| j
| j}}n| j| j
}}|rR|dkrRdnd}t|tjr]|jrft|tjr|jsdt|  d| d	| d
|
 d
|	 d
| d
t| d
t| d}n!dt|  dt| dt| d| d	| d
|
 d
|	 d
| d}|r| j }n| j|s| jj| j|dd}nd}| j|}|rt| }d| d| d}| d| d| d| d| d
}| j| |dt|}|r| jt|| |S | j| |S )NrH  Z__place_holder__Tr  truefalseztranspose_mxn<rl  rY  r   r   Fr  zalignas(std::max(std::size_t(z), alignof(z)))r   r  rX   r   )r3   r  r  rC  rD   r   rZ  r[  r  r^  r\  r]  r   r   r   r  rG   rQ  r  containsrR  r  getr   r   rj   r  r:   )r  r   r   r   is_store
store_moder~   factorr  r  Zld_srcZld_dstZneed_defineMNr  Zload_or_storetile_varZ	cpp_dtypeZalignasZdefine_linerS   rS   rT   gen_transposed_tile_load_store  s|   



&"z.CppTile2DKernel.gen_transposed_tile_load_storer   r   c                    s   | j |}| |}|  }| |rT| j|||dd}| dt|| j  }tj	
|}| |d|}| j| j|}	|	d| ||fi  t|	tsOJ d|	_|	S | |}
t ||
S )NF)rf  rH  r   r5  T)r  r  r  r`  ra  rl  rD   r  r3   r  r  r+  rQ  rR  r  rT  r   rF   r   rt  r  r5  )r  r   r   r   r.  rk  r*  r~   r   rX  r   r	  rS   rT   r5  '  s"   


zCppTile2DKernel.loadc                    s  d|v sJ t |tsJ ||js| |}| j|}|  }| |}| |rt| j	|||d|d}| dt
|| j  }| jsRtj|ttjtjg v ra| d| dt
| j d}	n| d| d}	| jt||	 d S | |}
t ||
|| d S )Nr  T)rf  rg  rH  r@  r   r   )r   rF   r   r  r  r  r`  r  ra  rl  rD   r  rg  r3   r  r  ry   rz   rQ  rR  r  r   r:   rt  r  r  )r  r   r   r	  r  r   r.  rk  Zstorebufr   r   r	  rS   rT   r  =  s*   




zCppTile2DKernel.storec                 C   sj   |   }| jr|d| d| dt| j d| d	 d S |d| d| dt| j d| d	 d S )Nr:  r;  r   r   r   )r`  r^  r   rD   r]  r\  )r  r   r.  rS   rS   rT   r  Z  s   ""z#CppTile2DKernel.codegen_inner_loopsc                    sz   t  ||}| jd | jk r| jnt| j\| _| _| j| jd kr0| j| _| j	| _
d| _|S | j| _| j| _
d| _|S )Nr4   r   FT)r  r  rY  ry  reversedr_  r[  r[  rg  r]  r  r^  rZ  r\  )r  groupreduction_groupr  r	  rS   rT   r  e  s   
zCppTile2DKernel.set_rangesr   c                 C   s   | j || j|  dS )Nr9  )r  r_  r`  r  rS   rS   rT   rt  w  s
   z"CppTile2DKernel.transform_indexingr  r   )r   r   r   r   rr  rY  r  r`  ra  rl  rj   r   r   r5  r  r  r  rt  r2  rS   rS   r	  rT   rs    s    
<rs  _bodyc                 C   s   | j gt| j  }d}d}|D ]Q}|jjD ]J}|jdks#|jdv r$q|jdvr+d}t|dr_|j	r_t
j|j	v s;J |j	t
j }|jrI|jtvrLd}q|dur[||jkrZtd q|j}qd}qq||fS )	z
    Returns the low precision data type (torch.float16/torch.bfloat16) contained in the nodes
    and if all the nodes can codegen with this data type without converting to float.
    Otherwise returns None and True.
    NFplaceholder)	get_indexr  )r5  r  r]  ra  r  Tr9  z.bf16 and fp16 are mixed in the scheduler node.)
root_blockr   	subblocksr  r  nodesoptargetr   r9  r@   r8  r~   ry   warningswarn)rp  
sub_blocksZ_lowp_fp_typeZ	_use_fp32	sub_blockr  r6  rS   rS   rT   get_loop_body_lowp_fp  s,   


r|  c                       sF   e Zd ZdZ fddZdeee ee f fddZdd Z	  Z
S )	TilingSelectz
    Implement the heuristic to select the tiling factors and tiling indices.
    In the future, we can implement advanced heuristic in a subclass.
    c                    s   t    d S r   )r  r  r  r	  rS   rT   r    r  zTilingSelect.__init__r   c           "         s  t |}t|}|sJ tdd |D rg g fS tj}t|d d   r7t fdd|dd  D r7 }t j	|d}| 
|||}|rt|dd d	\}}	t|t|	 }
tjjr_d
d }dd }dd }dd tt|
D }t|}|d | ||d  }}i }i }|D ]}|jgt|j  }|D ]p}|jjD ]i}|jdv r|jdkrdnd}|j||f|j| jd  }|||r|||||}|jdkr|d u rn|dvr||j| t|jtr|jds|jdv s|j|vrd||j< q||j  d7  < qqqt| }t| }d}d}||ks0|dkr4|| |kr4g g fS |	s_|r_t|dkr_t ||d  gs_||d  |d k r_|dk r_g g fS |t!v rt j	|d}|D ]N} | dk r{| t|
 } | dk s| t|
krqnt |
rt"jj#j$|
|  dd}!|!|k rt"jj#%|!| |d } nqn|
|  |k r|d } nqnt|dkr|g|fS t|dkr||g|fS g g fS )Nc                 s       | ]}|t vV  qd S r   )rm   r   r~   rS   rS   rT   r         z-TilingSelect.select_tiling.<locals>.<genexpr>r   c                 3   s     | ]}t |d   kV  qdS )r   N)r|  )r   	loop_bodyZ_lowp_fp_dtyperS   rT   r     s
    
r4   r  c                 S      t | d S r   r   sizesrS   rS   rT   rE        z,TilingSelect.select_tiling.<locals>.<lambda>r8  c                 S   s&   ||d  }t | ||}|jr|S d S Nr   )r   r  )r   rZ  rC  rY  r  r`  rS   rS   rT   _try_get_stride  s   z3TilingSelect.select_tiling.<locals>._try_get_stridec                 S   s(   | |vr
d|| < d S ||   d7  < d S r   rS   )Z	node_namenon_contig_indexing_op_counterrS   rS   rT   _update_negative_op_count  s   z=TilingSelect.select_tiling.<locals>._update_negative_op_countc                 S   sD   t |dko!t | dko!|d dkr|d n|d t |  t | k S Nr4   r   r  )rZ  rY  rS   rS   rT   _is_valid_indices  s   
z5TilingSelect.select_tiling.<locals>._is_valid_indicesc                 S   r  rS   r  r  rS   rS   rT   r+    r  z.TilingSelect.select_tiling.<locals>.<listcomp>)r  r5  r  r  r   r   r4   Zmasked_subblock)r1   r  r  rr  gQ?#   r0  
   r  )&rB   rA   rb   rz   ri   r|  r   r   r  	nelements_select_tiling_indicesr\   r   r   rp  Zenable_tiling_heuristicsr  r   rs  r   rt  r  r  ru  rw  r  Zindexing_from_argsr  r   rj   
startswithr_   r'   ry   r3   r  r  Zguard_lt)"r  fn_listvar_sizes_listloop_bodies
all_dtypesr~   rC  rY  rn  ro  rx  r  r  r  rZ  ry  r  reduction_varsZ
op_counterr  rp  rz  r{  r  Zarg_idxr   r`  Zop_numZnon_contig_indexing_op_numZratio_thresholdZquantity_thresholdZfactor_lowpZtiling_indiceZ
call_rangerS   r  rT   select_tiling  s   




















zTilingSelect.select_tilingc                 C   s  g }t ||D ]\}}tj|g|R  }|dd t|j|jD 7 }qtt  }g }	tt  }
tt  }|D ][}|j	D ]U}t
d|jsFq<t|||}|dkrQq<|dkrn|t|jdd   |	t|jdd   q<tdd |j	D r|
t|jdd   q<|t|jdd   q<q7||
 | }t|dd	 d
\}}t|t| }t|dkr|d gS |rt|dd  S ||
@ | }t|}t|dkr|d |v r|d |d kr|S t||	jd
dd  S )Nc                 S      g | ]}|j qS rS   )r   )r   deprS   rS   rT   r+  h  s    z7TilingSelect._select_tiling_indices.<locals>.<listcomp>z^d\d+$r   r4   c                 s       | ]	}t |tjV  qd S r   )r   r   ZSIZEr  rS   rS   rT   r   w  r   z6TilingSelect._select_tiling_indices.<locals>.<genexpr>c                 S   r  r   r  r  rS   rS   rT   rE  |  r  z5TilingSelect._select_tiling_indices.<locals>.<lambda>r  r  r   )r  r	   Zextract_read_writes	itertoolschainZreadsZwritesr   rh   r  r   searchr   r   rL  r
  r   r\   r   sortedcount)r  r  r  rC  Z	all_indexfn	var_sizesrwZcontig_varsZcontig_vars_listZnon_contig_stride_constZnon_contig_stride_otherr   r   r`  Zcontig_onlyrn  ro  Znum_itervarsZcontig_and_const_strideZcontig_vars_sortedrS   rS   rT   r  _  sL    




z#TilingSelect._select_tiling_indices)r   r   r   r   r  r   r   rh   r  r  r2  rS   rS   r	  rT   r}    s    
 ,r}  c                       s   e Zd Z fddZdd ZdefddZdefd	d
Zdd Z	dd Z
dd Zdee fddZdd Zdd Zd!dee fddZdeded fdd Z  ZS )"r  c                    s:   t  |j|jj || _d | _d | _t	 | _
g | _d S r   )r  r  r  wsr  r,  r  rx  r   r  picked_vec_isakernelsr  r,  r	  rS   rT   r    s   

zCppKernelProxy.__init__c                 C   s&   |D ]}t |tsJ t| qd S r   )r   r#   r9   propagate_scheduler_node)r  ru  r  rS   rS   rT   data_type_propagation  s   z$CppKernelProxy.data_type_propagationscheduler_nodec                 C   s<   t |jtsdS t| t|jd d uot|jd  S )NTr   r4   )r   rp  r   r9   r  r|  )r  r  rS   rS   rT   is_lowp_fp_scheduler  s   
z#CppKernelProxy.is_lowp_fp_schedulerr  c                 C   s@   dt jjfdd}|jgt|j  }|D ]}||j qd S )N	sub_graphc              	      sJ  dt jjdtt j fdddt jjdtt j fdddt jjdt jffdd	dt jjdt jffd
ddt jjdt jffdd}t| j}g |D ]}|jdv r|  tv rt	 fdd|j
D rnqP|jd }| |$ | jd||t jfd|fdd t jd7  _W d    n1 sw   Y  qP|jdkr|  tv r|j\}}}}}|| rqPtj|| | | jd||fd|| t jd7  _W d    n1 sw   Y  qP|jdkr'|j\}}}	}
|tv r&t jt jt jt jfv sJ |tv rt jnt j|	|
f|_qP|jdkrR|jd tv rR|j\}}
 t	 fdd|j
D rJqP||
t jf|_qP|jdkr|jd tv r|j\}} t	 fdd|j
D ruqP| ||t jf|_qP|jdkr|j\}}}|tv r|||s| | | jd|||fd|| t jd7  _W d    n	1 sw   Y  tv rt	fdd|j
D s|jd }| |$ | jd||t jfd|fdd t jd7  _W d    n	1 sw   Y  qP	 qPd t jjffd!d"}||  d S )#Nr   r   c                 S   sd   | j dkrtj| jd S | j dkr| jd S | j dkr0t| jdkr)| jd S | jddS dS )	z6Get input dtype for nodes that may consumes lowp fp dtr  r4   rZ  r  rO  r   r   N)rw  r3   r  r  r  r   r  re  r   rS   rS   rT   get_input_dtype  s   




z]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_input_dtypec                 S   sZ   | j dkrt| jdksJ tj| jd S | j dv r!| jd S | j dkr+| jd S dS )	z6Get output dtype for nodes that may produce lowp fp dtr5  r   r4   )rO  r  r  r  rZ  r   N)rw  r   r  r3   r  r  r  rS   rS   rT   get_output_dtype  s   




z^CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.get_output_dtypedtc                    s   |t v sJ  | |kS )z]Check if the given node produces output with expected low precision floating point data type.)ry   r   r  )r  rS   rT   is_lowp_fp_source  s   z_CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sourcec                    s2   |t v sJ  |  }r||kS | jdkrdS dS )zZCheck if the given node accept input with expected low precision floating point data type.rO  TF)ry   rw  )r   r  Zinput_dtype)r  rS   rT   is_lowp_fp_sink  s   
z]CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_sinkc                    s$   |  ot  fdd| jD S )zCheck if the node is a lowp fp sources which are all directly fed to ops that accepts lowp fp input
                thus no need to promote to float
                c                 3       | ]}| V  qd S r   rS   r   userr  r  rS   rT   r         

z}CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote.<locals>.<genexpr>r   usersr  )r  r  )r  rT   is_lowp_fp_source_no_promote  s   zjCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.is_lowp_fp_source_no_promote)r5  r  c                 3   r  r   rS   r  r  rS   rT   r     r?  zWCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<genexpr>r   rO  r  c                       |  uS r   rS   r  to_type_noderS   rT   rE    rK  zVCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.<lambda>r4   r  r  r  r  c                 3   r  r   rS   r  r  rS   rT   r     r?  c                 3   r  r   rS   r  r  rS   rT   r   "  r?  rZ  c                 3   r  r   rS   r  )r~   r  rS   rT   r   J  r?  c                    r  r   rS   r  r  rS   rT   rE  R  rK  r  c                    s"   dt jjf fdd}||  d S )Nr  c                    s   dt jjfdd dd | jD } fdd|D }|D ]7}| D ]0\}| jv rRtfdd|D sCv rRtd	d |D rRjd
 }| |  q"q| j	d u r_| 
  d S d S )Nto_nodec                 S   s   t dd | jD S )Nc                 s   s    | ]}|j d kV  qdS )rO  Nrw  r   usrrS   rS   rT   r   a  r?  zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_to.<locals>.<genexpr>r  )r  rS   rS   rT   _used_by_to`     zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>._used_by_toc                 S   s   g | ]	}|j d kr|qS )rO  r  r   rS   rS   rT   r+  c  s    zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<listcomp>c                    s   g | ]} |r||j iqS rS   )r  r   )r  rS   rT   r+  f  s
    c                 3   s$    | ]}|j d   j d  kV  qdS r  Nr  r  r  rS   rT   r   l     " zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node.<locals>.<genexpr>c                 s   s    | ]
}|j d  tv V  qdS r  )r  ry   r  rS   rS   rT   r   o      
r  )rz   fxNoderu  r  r   Zall_input_nodesreplace_all_uses_withZ
erase_nodeZowning_moduleZlint)r  Zall_to_nodesZall_to_nodes_and_usersZ
node_usersr  Zval_nodeto_lowp_fp_legalized_nodes)r  r   rT   _eliminate_duplicate_to_nodeY  s2   





zCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype.<locals>._eliminate_duplicate_to_node)rz   r  Graph)r  r  r  rS   rT   eliminate_to_dtypeX  s   )z`CppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype.<locals>.eliminate_to_dtype)rz   r  r  r   r~   r   ru  rw  ry   r   r  r  Zinserting_afterZcall_methodri   r  r   Zcpp_to_dtype_countr3   r  r  Zinserting_beforeZreplace_input_withrO  rP  r  r
  r  )r  r  Zsub_graph_nodesr  r1   r   r   Z	value_varr   r}   r	  rV  r  rS   )r  r~   r  r  r  r  r  r  rT   add_to_dtype  s   




	








	,zDCppKernelProxy.legalize_lowp_fp_dtype_loopbody.<locals>.add_to_dtype)rz   r  r  rs  r   rt  r  r  )r  r  r  rz  r{  rS   rS   rT   legalize_lowp_fp_dtype_loopbody  s    ]z.CppKernelProxy.legalize_lowp_fp_dtype_loopbodyc                    s   t  fdd|D rJ|D ]:}|jjgt|jj  }|D ](}|jjD ]!}|jdv rE|j	s0J t
j|j	v s8J |j	t
j }|jtv sEJ q$qqd S |D ]}t|tsUJ t|jts]J |j}| si | qLd S )Nc                 3   s$    | ]}t |to |V  qd S r   )r   r#   r  r   r  r  rS   rT   r     s
    
z8CppKernelProxy.legalize_lowp_fp_dtype.<locals>.<genexpr>)r5  r  )r   rp  rs  r   rt  r  r  ru  rw  r9  r@   r8  r~   ry   r   r#   r   Zis_memory_copyr  )r  ru  r  rz  r{  Zfx_noder6  r  rS   r  rT   legalize_lowp_fp_dtype  s8   




z%CppKernelProxy.legalize_lowp_fp_dtypec                     sx  t  t ks
J | jtdd d\|  fdd} fdd|t}tj j|jO  _tj j|jO  _t	
|| _| jrO| jsa|g| _| dd  | j|  d S tjjjdd	 t }| \}}t |t |ksJ d
}tt }	tdd |	D rd}d}
d }|rd}|d }|d }t | jj|kr| jj| j}| jj| j}|o| }
t |dkrt jd7  _| jj|d |d d}|t|d |d }|j |j! }|j"d|j!fi|_#tj$j%r|r|t|d |d |}n|}|j"g|_&|j"|j!|j fi|_#||g| _|}nt |dkr|d t | jd kr:|d |d ks<J t jd7  _| jj|d |d d}d|j!f|j!|j fd}|j |j! }| jj|d |d d}d|j!f|j!|j fd}|j |j! }|t'|d |}|j"|d |j"|d i|_#g }tj$j%r|rdD ]3\}}|dkr|nd }|dkr|nd }|t'|d |||}|j"|| |j"|| i|_#|(| qn;|t|d |d }|j"|d |j"|d i|_#|j"g|_&|(| |j"|d |j"d|j fi|_#|j"|j"g|_&|(| |g| | _|}n|g| _| |
| | j|  W d    d S 1 s5w   Y  d S )Nc                 S   r  r   r  r  rS   rS   rT   rE    r  z2CppKernelProxy.codegen_functions.<locals>.<lambda>r  c                    sP    j | g|R  }t jd8  _| |W  d    S 1 s!w   Y  d S r   )
new_kernelr   Zgenerated_kernel_count)r   r  r/  )r,  runrS   rT   codegen_kernel  s
   $z8CppKernelProxy.codegen_functions.<locals>.codegen_kernelc              	      s   |  \}}d}t D ]L\}}|fttdffv r-|r'J ||| qd}|dfksBJ d| d d |   ||d W d    n1 sVw   Y  qd S )NFrS   Tzunexpected group: r  r   )r  r  r   r  r  r  )r/  r  r  Z	in_suffixr  r  )r  rn  ro  r  rS   rT   r    s*   
z-CppKernelProxy.codegen_functions.<locals>.runFZinplace_buffersTc                 s   r~  r   )rn   r  rS   rS   rT   r     r  z3CppKernelProxy.codegen_functions.<locals>.<genexpr>r   r4   )rh  r   maintailr  )r  )r  r  )r  r  r  ))r   r,  r\   r  ru  r3   r  removed_buffersZinplaced_to_remover  r  r  r  rZ  r  aggregate_reduction_buffers
set_kernelrz   	_inductorr   patchr}  r  rA   rB   rb   r  r  r   generated_cpp_vec_kernel_counttiler  r  
tiled_sizer   rv  rp  Zenable_loop_tail_vecrw  rs  r
  ) r  r  r  r  Zscalar_kernelZtiling_selectZtiling_factorsrY  Zcould_masked_vecr  Z_inner_loop_reduction_outer_notZ_outer_loopZinner_loop_reductionZouter_loop_levelZinner_loop_levelZouter_loop_reductionr  Z
vec_kernelrg  Ztail_kernel
outer_loopr'  r[  Z
inner_loopZinner_rangesrZ  Ztile2d_kernelZouter_rZinner_rZ_inner_tail_sizeZ_outer_tail_sizer/  rS   )r  rn  r,  ro  r  r  rT   codegen_functions  s  	













 $z CppKernelProxy.codegen_functionsc                 C   s.   |D ]}|  | t| q| || d S r   )r  r9   Zpropagate_loopbodyr  )r  r  r  r  rS   rS   rT   codegen_loop_bodies`  s   
z"CppKernelProxy.codegen_loop_bodiesru  c                    s   |  | | | t|dksJ dd   fdd|D }ttjtr6tjjr6dd fdd|D }d	d |D }| || d S )
Nr4   c                 W   s0   |    |   ttjtr| j| S | |S r   )Zdecide_inplace_updatemark_runr   r3   r/  r0   rp  Zcodegen)r   
index_varsrS   rS   rT   r  l  s
   

z(CppKernelProxy.codegen_nodes.<locals>.fnc                    s   g | ]}t  |qS rS   )r  partialr   )r  rS   rT   r+  t  s    z0CppKernelProxy.codegen_nodes.<locals>.<listcomp>c                 S   s   t j| }| |_|S r   )r3   r  Zlocalize_functionZoriginal_fn)r  Z
wrapped_fnrS   rS   rT   wrap_fn{  s
   z-CppKernelProxy.codegen_nodes.<locals>.wrap_fnc                    s   g | ]} |qS rS   rS   )r   r  )r  rS   rT   r+    s    c                 S   s   g | ]}|j d  qS )r4   )rn  r   rS   rS   rT   r+    r  )	r  r  r   r   r3   r  rI   r  r  )r  ru  r  r  rS   )r  r  rT   codegen_nodesf  s   


zCppKernelProxy.codegen_nodesc                 C   s   |  | j|| d S r   )r  r  )r  r   r  rS   rS   rT   r    r  zCppKernelProxy.codegen_loopsc                 C   s   | j D ]}|  qd S r   )r  r  r  r/  rS   rS   rT   r    s   

z4CppKernelProxy.update_stores_with_parallel_reductionNr   c              	   C   st   |d usJ d}| j D ],}t }|||r(d}||  ||  W d    n1 s2w   Y  qd S )N
C10_LIKELYC10_UNLIKELY)r  r   r   r  r   r   r  r  )r  r   Z	if_prefixr/  r   rS   rS   rT   r    s   

zCppKernelProxy.gen_bodyinner_loop_reduction_outer_notr  	LoopLevelc                    s   d fdd} j d }|r|sJ || n|   j|j  j|j  j|j  j|j  j|j  j|j  j	|j	 d S )Nr  r  c              	      s  t  jdks	J  jd } jd }t|tsJ t|tkr5||j |   j	|j|j  n|   j	|j t
 }t }||d| jr]||  |	|j W d    n1 sgw   Y  t \}||d| jr||  t|tkr|j}|D ]}| d| j dt| j d}t|j|| t|j|| q|	t|j| j| j d	| j| j n|	|j W d    n1 sw   Y  | _d S )
Nr   r   r  r  r  r   z_tail - r  r  )r   r  r   r  r   ru  r  rC  rz  r  r6   r   r   r  r   r   r   r|  r  rD   r  r   r  r   r  )r  Zmain_loop_kernelZtail_loop_kernelZ
suffix_bufr   r  r   r   r  rS   rT   !aggregate_reduction_prefix_suffix  sf   






zUCppKernelProxy.aggregate_reduction_buffers.<locals>.aggregate_reduction_prefix_suffixr   )r  r  )
r  r  rz  r  r|  r}  r~  r  r  r  )r  r  r  r  Zmain_kernelrS   r  rT   r    s   
8
z*CppKernelProxy.aggregate_reduction_buffersr   )r   r   r   r  r  r#   r  r   r  r  r  r  r   r  r  r  r   r6   r  rg   r  r2  rS   rS   r	  rT   r    s&    
 b 9!
r  c                       s$   e Zd Z fddZdd Z  ZS )r-  c                    s   t  |j|jj g | _d S r   )r  r  r  r  r  r.  r  r	  rS   rT   r    s   
zOuterLoopFusedKernel.__init__c              	   C   sr   g }dd | j D }|D ]}|j}|d usJ ||tt||j |jd|j qtt|jt	||jdS )Nc                 S   s   g | ]}|  qS rS   )r  )r   r  rS   rS   rT   r+    s    z>OuterLoopFusedKernel.decide_parallel_depth.<locals>.<listcomp>r  )
r.  rx  r
  r  r   r   r   r   r[   r\   )r  r  r  Zkernels_parallel_depthZnested_kernelsr/  rx  rS   rS   rT   r    s.   
z*OuterLoopFusedKernel.decide_parallel_depth)r   r   r   r  r  r2  rS   rS   r	  rT   r-    s    r-  c                   @   s   e Zd ZdZdZdZdS )ReasonFusedNodesZsame_vars_reduceZcompatible_reductionZcompatible_ranges_no_reductionN)r   r   r   SAME_VARS_REDUCECOMPATIBLE_REDUCTIONCOMPATIBLE_RANGES_NO_REDUCTIONrS   rS   rS   rT   r  
  s    r  c                       sd  e Zd ZdZeejejgZe	de
jdee fddZ fddZdefd	d
Zdd Zdd Zdd Zdee fddZdd Zdd Zdd ZdededefddZdd Zdd  Zd!d" Zd#d$ Zd%ee  fd&d'Z!d(e"fd)d*Z#d(e$e"e%e f fd+d,Z&d(edefd-d.Z'd/ed0e(e d1e(e fd2d3Z)d4d5 Z*d6d7 Z+d8d9 Z,d?d;d<Z-d=d> Z.  Z/S )@CppSchedulingi  devicer   c                 C   r  r   )backend_features)r   r  rS   rS   rT   get_backend_features  s   z"CppScheduling.get_backend_featuresc                    s"   t  | |r|   d| _d S NF)r  r  reset_kernel_group_ready_to_flush)r  r   r	  rS   rT   r     s   
zCppScheduling.__init__statusc                 C   
   || _ d S r   r  )r  r   rS   rS   rT   _set_flush_status&     
zCppScheduling._set_flush_statusc                 C   s   t dd |D S )Nc                 s   s$    | ]}t ttjjj|V  qd S r   )r   rG  r3   r  r  r   r  rS   rS   rT   r   *  r  z)CppScheduling.group_fn.<locals>.<genexpr>)r   )r  r  rS   rS   rT   group_fn)  r  zCppScheduling.group_fnc                 C   s   t  | _d S r   )KernelGroupr,  r  rS   rS   rT   r  ,  s   z CppScheduling.reset_kernel_groupc                    s  |  s|  rt||S | r| rJ t||S | ||tjkrt|t	tfs0J t|t	tfs9J |j
\}\}}|j
\}\}}|dkrO|dksUJ ||f fdd t|t|k re|n|}t|t	snJ t|t|k rx|n|}	 |	}
|j|
d |j
\}\}}|j
\}\}}||krt||S  |}t|	t	r|	j|d n!t|	tsJ |	jD ]}t|t	sJ |j|d qt|	j|	j}	|j
\}\}}|j
\}\}}||ksJ ||ft||S | ||rt||| ||S t||S )NrS   c           	         s   t | trAt| jdksJ | jd }tt  }| jD ]} |\}}|d u r)|}||ks5J ||| jf|| q|t|fS t | tsHJ | j	}t |t
jsSJ | \}}}|jt|j fS r  )r   r!   r   snodesr   r   updater   r#   r   r   ComputedBufferget_default_sizes_bodyr  indexing_exprsr  )	r   r  r  snodevexprsZcomp_bufferr   r  get_indexing_ranges_exprsrS   rT   r  A  s    


z5CppScheduling.fuse.<locals>.get_indexing_ranges_exprs)extra_indexing_constraints)Z
is_foreachr    r  is_templater!   _why_fuse_nodesr  r  r   r#   rn  r   recompute_size_and_bodyr  r   can_fuse_vertical_outer_loopr   _get_outer_loop_fusion_depth)r  r   r   r   vars1reduce1vars2reduce2node_to_recompref_nodeZref_indexing_constraintsZ#node_to_recomp_indexing_constraintsr  rS   r  rT   r  /  s`   


zCppScheduling.fusec                 C   sb   |j \}\}}|j \}\}}||kr||krtjS |dkr&||| kr&tjS | ||r/tjS d S )NrS   )rn  r  r  r  &_can_fuse_nodes_with_compatible_rangesr  )r  r   r   r   r  r  r  r  rS   rS   rT   r    s   zCppScheduling._why_fuse_nodesc                 C   s  |j \}\}}|j \}\}}|dko|dk}t|t|k}	t|dkp+t|dk}
|r2|	r2|
s4dS t|t|k r>|n|}t|t|k rJ|n|}t|trSdS t|tsZJ t|jtj	rcdS t|jtj
slJ |jj }d }t|trtttdf   }|jD ]}t|jtj	r nt|jtj
sJ |t|jj  qt|dkrdS ttt|}nt|tsJ t|jtj
sJ |jj }||krdS dS )NrS   r4   F.T)rn  r"  r`   r   r   r!   r#   r   r   ZTemplateBufferr	  dataget_sizer   r   r   r  rL  r   nextiter)r  r   r   r   r  r  r  r  c1c2c3r  r  Zranges2Zranges1Z
ranges_setr  rS   rS   rT   r    sB   


z4CppScheduling._can_fuse_nodes_with_compatible_rangesc                 C   sN   t |ttfs	J t |ttfsJ tdd ||fD rdS | ||d uS )Nc                 s   s    | ]}t |tV  qd S r   )r   r   r   rS   rS   rT   r     r  z:CppScheduling._can_fuse_horizontal_impl.<locals>.<genexpr>F)r   r!   r#   rb   r  r  r   r   rS   rS   rT   _can_fuse_horizontal_impl  s   z'CppScheduling._can_fuse_horizontal_implc                 C   sD   |  s|  r
dS t| t|  tjjkrdS | ||S r  )r  r   r  r   rp  Zmax_horizontal_fusion_sizer&  r%  rS   rS   rT   can_fuse_horizontal  s   z!CppScheduling.can_fuse_horizontalr   r   c                 C   sR   |   }r't|jtjo&t|jtjo&t|jjdko&|jjd 	 |j
kS dS )Nr4   r   F)Zget_template_noder   layoutr   ZMultiOutputLayoutr   MultiOutputr   Zinputsr  r   )r  r   r   Ztemplate_bufrS   rS   rT   can_fuse_multi_outputs_template  s   z-CppScheduling.can_fuse_multi_outputs_templatec                 C   sX  d}t dd ||fD s|S t|tr| d n|}t|ttfs%J t|tr0| d n|}t|ttfs;J |j\}\}}|j\}\}	}
|dkr[|	dkr[|dkr[|
dkr[|S t dd ||fD rq|j|jkro|jS |S tt	|t	|	}|dkr|d | |	d | krt
dd ||fD rt|tu r|n|}|j|kr|S |S |S |S )	Nr   c                 s   r   r   )r   r   r!   r#   r   rS   rS   rT   r     s    
z=CppScheduling._get_outer_loop_fusion_depth.<locals>.<genexpr>r  rS   c                 s   r   r   r   r   rS   rS   rT   r     r   r4   c                 s   r   r   r   r   rS   rS   rT   r     s    
)r   r   r   r   r!   r#   rn  r   r[   r   rb   r   )r  r   r   ZDISABLE_OUTER_LOOP_FUSIONZ_node1Z_node2r   r  r  r  r  r   Z_compare_noderS   rS   rT   r    sL    
z*CppScheduling._get_outer_loop_fusion_depthc                 C   sJ   |   o$|   o$| |j@ o$| ||o|   o$| ||dkS r   )r  Zget_operation_namesZ	ancestorsr&  r  r  r%  rS   rS   rT   r    s   
z*CppScheduling.can_fuse_vertical_outer_loopc                 C   s   |  ||rdS dS r  )r  r%  rS   rS   rT   get_fusion_pair_priority(  s   z&CppScheduling.get_fusion_pair_priorityc                 C   sT   |  rdS |  rt||g\}}|  o|S | ||o#|  p)| ||S r  )r  rL   r  r&  r  )r  r   r   Ztemplate_fusion_supportedr   rS   rS   rT   can_fuse_vertical/  s   
zCppScheduling.can_fuse_verticalru  c                    s  t dd |D r|S ddd}d}d}d}|D ]}t|jtjs$J |j \}}}|j D ]i\}	|	t	D ]_ t  fdd|j
D rR |krR }|d7 }|dkr^|      S t jd tjjjr jd |j
v rdurt fdd|j D r jd d	kr jd  jd d
}|}q:q1q|s|S dfdd}
|D ]}||kr|j|
d q|D ]}||kr|j|
d q|S )aI  
        Apply loop split optimization.
        When one of the indexing_exprs contains a division, we eliminate the division by splitting the loop
        to avoid non-contiguous loads, subject to the following conditions:
            1. No reduction and no mudular index for all nodes.
            2. The indexing_exprs of all nodes contain only one (or more, but all the same) division,
               where the divisor is an integer and not too small (the divisor > 8), the dividend is
               one of the iter_vars, and this var, i.e. the dimension that needs to be split, is
               contiguous in all other indexing_exprs.

        For example, if the node's var_ranges: {z0: 2, z1: 9216, z2: 960} and indexing_exprs:
        {'index0': 8847360*z0 + 960*z1 + z2, 'index1': 32*z0 + (z2//30), 'index2': z2},
        we will split z2 -> 30*z2 + z3, then the node's var_ranges will be changed to
        {z0: 2, z1: 9216, z2: 32, z3: 30} and indexing_exprs will be changed to
        {'index0': 8847360*z0 + 960*z1 + 30*z2 + z3, 'index1': 32*z0 + z2, 'index2': 30*z2 + z3}.
        c                 s   s@    | ]}t |jd  d  dkptdd |jj D V  qdS )r4   r   c                 s   s    | ]}| tV  qd S r   )r   r   )r   rW  rS   rS   rT   r   Q  r  z9CppScheduling.try_loop_split.<locals>.<genexpr>.<genexpr>N)r   rn  rb   rp  r  r  r   rS   rS   rT   r   O  s    

z/CppScheduling.try_loop_split.<locals>.<genexpr>Nr   Fc                 3   s    | ]}  |V  qd S r   )r   )r   r   )div_exprrS   rT   r   e  r?  r4   c                 3   s0    | ]\}}|krt | jd  dv V  qdS )r   r  N)r   r  )r   Zname_Zexpr_)r-  r   rS   rT   r   p  s    r!  Tc                    s   | \}}|\}}| }| }||  ||< ||d  tj||dd\\}	}
}|	 }||d }||  | ||< t|||g||	|} sY|jt	|j
 f ||f||	|ffS )Nr4   r  )r  )r   copyinsertr	   Zindex_vars_no_squeezepopr   r   r  r   r  r  )r  r  r  Z
index_sizeZreduce_sizer  Zreduce_varsZ	split_idxZnew_index_sizeZnew_index_varsr   r  	iter_varsZdivisor_var)r  split_number	split_varrS   rT   
loop_split  s.   
z0CppScheduling.try_loop_split.<locals>.loop_split)recompute_sizes_body_func)r  r5  )rb   r   r   r   r	  r
  r  r  findr   r1  r  r   corenumbersr#  r   r  )r  ru  Znum_divZ	div_expr_Z	match_divZmatched_noder   r   Zoriginal_bodyrW  r4  rS   )r-  r  r   r2  r3  rT   try_loop_split<  sl   

zCppScheduling.try_loop_splitr   c                    s   | j tj}g  g t|tsJ dtf fdd}||si|t_     tjjj	dd, |
 D ]}t|ttfsCJ | }t}|| || q8W d   dS 1 sbw   Y  dS dS )a  
        Generate the code for the outer loop fused scheduler node.
        1. Codegen with fused outer loop: depends on the analysis of
            the outer loop fused scheduler node, with or without the local buffer.
        2. If failed, fallback to standard codegen.
        r   c              	      s  t tsJ     dtfdd g }i t fdd D rtt   D ]t t	s:J 
   sMt dkrNq1 d tfddjD rˈj}t |tjskJ | }jt  }fd	d
}| r| sq1t|j|j|j|d |j|d }fdd}d}|||}	|	stj| dt| |d}	||	 g |	j< |	j | q1tj}
t|dkr|D ]}|jdusJ |
 ||j  qڈ D ]"}t |t!t	fsJ t"}|#|  | |  q$js3|
j%D ]
}t&j'j%(| q	 W d   dS t)j*t)j+tt|
j,d -}.|g t/j01 W d   dS 1 s`w   Y  dS )zN
            Codegen code with fused outer loop and local Buffer.
            r   c                 S   sH   t | ttfs	J |  }t|dd dj\}\}}t|t| }|S )Nc                 S   s   t |  S r   )rh   r  r\  rS   rS   rT   rE    r  z~CppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_ranges.<locals>.<lambda>r  )r   r#   r!   r  r\   rn  r   )r   ru  r   rn  ro  rx  rS   rS   rT   get_call_ranges  s   
zlCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.get_call_rangesc                 3   s&    | ]}t  |jd  kV  qdS )r4   N)r   r   r  )r:  r   rS   rT   r     s
    
zfCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.<genexpr>r4   r   c                 3   s    | ]
}|j   v V  qd S r   )r   r  r  r  rS   rT   r     r  c                     st   d d} t jj D ]\}} | | 7  | |9 } qj } fdd|o9tfddjD S )Nr   r4   c                    s   |  kS r   rS   r\  )contiguous_index_exprrS   rT   is_contiguous_index  s   zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.is_contiguous_indexc                 3   s2    | ]}t |jto |jj V  qd S r   )r   r   r#   rp  Zget_read_exprr  r  )r<  scheduler_bufferrS   rT   r     s    
zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguous.<locals>.<genexpr>)rm  rp  r  r  Zget_write_exprr  r   r  )r`  r   r  Zwrite_index_expr)r=  r  )r;  r<  rT   is_all_write_read_contiguous  s   

zyCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.is_all_write_read_contiguousNc                    s<   |D ]}| |j krtfdd |j D r|  S qd S )Nc                 3   s>    | ]}|j d urt fddtjjj|j  jD V  qd S )Nc                 3   s    | ]
}|j   v V  qd S r   )r   r  r  visited_scheduler_nodesrS   rT   r     s
    
zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>.<genexpr>)r   r   r3   r  r   Zname_to_bufr  )r   global_bufferr?  rS   rT   r     s    
	zCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_buffer.<locals>.<genexpr>)r(  r   r   )local_buffer_layoutr  Z	local_buf)local_to_global_buffersr@  rS   rT   try_share_local_buffer  s   zsCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_buf.<locals>.try_share_local_bufferZlocal_buffer_datar   )r   r(  F)Zlocal_buffer_numberT)2r   r   clearr   r   r   r   rj   r  r#   rL  r  r  r   Zget_outputsr  r   r   r	  r  r   Zis_contiguousZFixedLayoutr  r~   r  r`  ZBufferr
  r   rI   r  Zadd_local_bufferr!   r  r  r(  r  r3   r  remover   Z!cpp_outer_loop_fused_inner_countsZCppOuterLoopFusedCountr  r0  finalize_kernelr  r  from_iterable)r   r  rA  Zglobal_buffer_layoutZsize_offsetr>  rB  rD  Zlocal_buf_prefixZlocal_buffer_usedscoper   r  r&  Zremoved_bufferZouter_fusion_cpp_kernel_proxyr$  r,  Z
nodes_list)r:  rC  r   r=  r  r@  rT   $try_outer_loop_fusion_with_local_buf  s   	
	






$$zSCppScheduling.codegen_outer_loop_node.<locals>.try_outer_loop_fusion_with_local_bufFr  N)r,  r   r  r   r   rE  rz   r  r   r  r   r!   r#   r  r  r  rG  )r  r   r  rK  r  _nodesr&  rS   rJ  rT   codegen_outer_loop_node  s*   
 "
"z%CppScheduling.codegen_outer_loop_nodec                 C   sp   | j }t|tr| | n| }| |}t|}|| ||| | 	 }|t
jkr6| d dS dS )zC
        Turn an set of pre-fused nodes into a C++ kernel.
        TN)r,  r   r   rM  r  r9  r  r  rG  _get_scheduled_num_argsr  MAX_FUSED_KERNEL_ARGS_NUMr  )r  r   r,  ru  r&  args_numrS   rS   rT   codegen_nodel  s   



zCppScheduling.codegen_nodec                 C   s   t |tot |jtjS r   )r   r#   r   r   CppTemplateBuffer)r  r   rS   rS   rT   is_cpp_template  s   zCppScheduling.is_cpp_templatetemplate_nodeepilogue_nodesprologue_nodesc                 C   s  |rJ dd |D }t d d  d7  < t d d  t|7  < | |s*J dtt|}|j\}\}}|dks<J ttj|j}d	d |D }t	d
d |D sWJ ddd }|||j
|}	|j||	|d\}
}|
 t|jsx|  |D ]}|  qz| }W d   n1 sw   Y  t|
 |g|}| |||
j}W d   n1 sw   Y  t|jrt|jdksJ d|jd jD ]}t|jtsJ dt|jjtjsJ d|j  q|
|| tj j|
jO  _|   dS )zG
        Codegen a CPP template, possibly with fused epilogues
        c                 S   s   g | ]}t |ttfr|qS rS   )r   r#   r!   )r   Zepilogue_noderS   rS   rT   r+    s    z2CppScheduling.codegen_template.<locals>.<listcomp>ZinductorZcpp_templated_kernel_counterr4   Zcpp_epilogue_fusion_counterzlTemplate node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBufferrS   c                 S   r  rS   r  r  rS   rS   rT   r+    s    c                 s   r  r   )r   r   r	  r  rS   rS   rT   r     r   z1CppScheduling.codegen_template.<locals>.<genexpr>z9Epilogue nodes must all be instances of ir.ComputedBufferc                    s>    sdS |   |v sJ ||    j}t fdd|D  S )NFc                 3   s(    | ]}t |jto|jj v V  qd S r   )r   r   r   r  rU  rS   rT   r     s    

zZCppScheduling.codegen_template.<locals>.template_buffer_has_other_users.<locals>.<genexpr>)r  r  r   )Ztemplate_bufferoutputs_by_namerU  r  rS   rW  rT   template_buffer_has_other_users  s   
zGCppScheduling.codegen_template.<locals>.template_buffer_has_other_users)$flag_template_buffer_has_other_usersrU  NzSMulti outputs template should be with 1 output template buffer of MultiOutputLayoutr   z?Multi outputs template should be with ExternKernelSchedulerNodez7Multi outputs template has multi users with MultiOutput)r   r   rS  r   r#   rn  r   rR  r   r   rX  Zmake_kernel_renderr(   r  r3   Zset_kernel_handlerdefine_kernelr  Zoutputsr  r   r   r)  call_kernelr  r  Zfree_buffers_in_scheduler)r  rT  rU  rV  r   ZrnumelZctbZepilogue_ir_nodesrY  rZ  r/  renderr   src_codeZnode_schedulekernel_namer  rS   rS   rT   codegen_template  sl   	





zCppScheduling.codegen_templatec                 C   s
   | j  S r   )r,  get_num_argsr  rS   rS   rT   rN    r  z%CppScheduling._get_scheduled_num_argsc                 C   r  r   r  r  rS   rS   rT   ready_to_flush  r  zCppScheduling.ready_to_flushc                 C   s   d S r   rS   r  rS   rS   rT   codegen_sync  s   zCppScheduling.codegen_syncNc                 C   s  t jj}tjjrt|tjjnd}dd|| g}t jj	r!|nd}|
ttj|}|
ttj|}|
dd}|d}|d|}	|||	d	   d
}
t }|d u r\| jjn|}| \}}}t jj	sr|d|d |j|dd t jj	s|d |j|| d|
d |S )NrP   r   rp  r/  z#pragma CMTz//z
extern "C"r   r4   z;
zasync_compile.cpp_pybinding(z, '''T)stripz''')F)ZgpuZcpp_definition)r3   r  wrapper_coder   rp  Zdescriptive_namesr&   r  Znext_kernel_suffixZcpp_wrapperr   rj   r+   KERNEL_NAMEDESCRIPTIVE_NAMErfindr6  r<   r,  r  cpp_argdefsr   r  r[  getvalue)r  r^  ru  Zkernel_argsr  Z
fused_namer_  kernel_decl_name
first_char	last_charZkernel_definitionZcompile_wrapperr  r   	arg_typesrS   rS   rT   r[    s8   

zCppScheduling.define_kernelc                 C   sF   | j  }|r| || j j}| j tjj| |   | 	d d S r  )
r,  codegen_groupr[  scheduled_nodesr\  r3   r  re  r  r  )r  r^  r_  rS   rS   rT   flush  s   
zCppScheduling.flushr   )0r   r   r   rO  r   r5   ZINPLACE_BUFFERSZREDUCE_TO_SINGLE_ELEMENTr  r1  rz   r  r  r  rg   r  r  r  r  r   r  r  r  r&  r'  r   r*  r  r  r+  r,  r   r#   r9  r   rM  r   r!   rQ  rS  r   r`  rN  rb  rc  r[  rq  r2  rS   rS   r	  rT   r    sd    R8	
6o
 B

W
%r  c                       sL   e Zd Z fddZdd Zdd Zdd Zdd
efddZdd Z	  Z
S )r  c                    sH   t    t | _t | _t| j| _t	 | _
| j
| j g | _d S r   )r  r  r>   r  r6   
loops_codeWorkSharingr  r   r   r   r   rp  r  r	  rS   rT   r    s   


zKernelGroup.__init__c                 G   s   || j t g|R  S r   )r  r*   )r  r   r  rS   rS   rT   r  !  r  zKernelGroup.new_kernelc                 C   s*   |  j |7  _ | j}| j}||| d S r   )rp  rr  r  r  )r  r  ru  r   r  rS   rS   rT   rG  $  s   zKernelGroup.finalize_kernelc                 C   s   | j  \}}}t|}|S r   )r  ri  r   )r  arg_defsZ
_call_argsZ
_arg_typesrP  rS   rS   rT   ra  *  s   zKernelGroup.get_num_argsNr   c              	   C   sh  | j   | js
dS t }tjjotjdv }|r|	dg |
t  |d u r.ttjn|}|d u r9ttjn|}| j \}}}dd|}t }|
d| d| d| d	 | G |rtjj}	|	d urtd
t|	 d nd}
|	d|
|  dg | j D ]\}}|
d| d| d q|| j W d    | S 1 sw   Y  | S )NrP   )linuxrO   z!#include <ATen/record_function.h>z,
   zextern "C" z void r  r   Zgraph_r   zRECORD_FUNCTION("z#", c10::ArrayRef<c10::IValue>({}));r   r   r   )r   r  rp  r6   r   rp  enable_kernel_profilesysplatformr   r   r   Z
cpp_prefixrj   r+   rf  rg  r  ri  ljustr  rU   r   r3   r  graph_idaliasesr  rr  rj  )r  r   r   rw  rk  r_  rt  r   Zfunc_export_declr{  r  oldnewrS   rS   rT   ro  /  s>   


zKernelGroup.codegen_groupc                 C   s&   | j  \}}}|j||d|d d S )NF)Ztritonrn  )r  ri  Zgenerate_kernel_call)r  r  r_  r   Z	call_argsrn  rS   rS   rT   r\  W  s   
zKernelGroup.call_kernelr   )r   r   r   r  r  rG  ra  rj   ro  r\  r2  rS   rS   r	  rT   r    s    	(r  c                   @   s<   e Zd Zdd Zdd Zdd Zdd Zd	d
 Zdd ZdS )rs  c                 C   s    || _ d| _d | _t | _d S r  )r   in_parallelr  r   r   r   )r  r   rS   rS   rT   r  _  s   zWorkSharing.__init__c                 C   sz   | j r|| jkr|   | j s;|| _d| _ tjjr | jd n
| jd| d | j	| j
  | jd d S d S )NTz#pragma omp parallelz!#pragma omp parallel num_threads(r   zint tid = omp_get_thread_num();)r  r  r  r   rp  r  r   r   r   r   r   )r  r  rS   rS   rT   r  e  s   zWorkSharing.parallelc                 C   s   | j r	| jd | j S )Nz#pragma omp single)r  r   r   r  rS   rS   rT   r  u  s   zWorkSharing.singlec                 C   s   | j   d| _d S r  )r   r  r  r  rS   rS   rT   r  z  s   

zWorkSharing.closec                 C   s   | j   | S r   )r   r:  r  rS   rS   rT   r:  ~  s   
zWorkSharing.__enter__c                 C   s   | j ||| d S r   )r   r?  r;  rS   rS   rT   r?    r  zWorkSharing.__exit__N)	r   r   r   r  r  r  r  r:  r?  rS   rS   rS   rT   rs  ^  s    rs  c                   @   s   e Zd ZU dZeej ed< dZeej ed< ej	j
Zejed< ej	j
Zejed< ej	jZejed< dZeed< d	Zeed
< d	Zeed< d	Zeed< d	Zeed< dd Zdd Zdd ZdS )r  Nr   r  r  r  r  r   r  Fsimd_ompsimd_vec	collapsedr  c                 C   s$   t  }|r| | _d S d| _d S r  )r   r  r  simd_nelements)r  r  rS   rS   rT   __post_init__  s   	zLoopLevel.__post_init__c                 C   sP   t |}t| j| j}||_d|_t|j|| |_| j	|_	d|_
| j|_|S )NTF)r   r#  r  r   r  r  r  r   r  r  r  r  )r  rh  Zsympy_factorr  rS   rS   rT   r    s   
zLoopLevel.tilec           	      C   sZ  t | j}t | j}tjjr||krd S | jr#| jdkr#d| j dnd}| jrEd}| jdkr8|d| j d7 }| jrD|	dd| }n| j
rKd}n| jrTd	| }n| js^t r^d
}nd}t d| j d| }| j d| }| jjr| j dt | j }n| j dt | j dt | j d}d| d| d| d}| js|s|gS ||gS )Nr4   zsimd simdlen(z) rP   z#pragma omp forz
 collapse(r   z for z#pragma omp z#pragma GCC ivdepr   r=  <r<  z+=(z == 0 ? 1 : zfor(r   )rD   r  r  r   rp  Zno_redundant_loopsr  r  r  r   r  r  r   r>  rH   r   r  r  r  )	r  Zoffset_exprZ	size_exprZsimdline1Z
offset_strr  Z	steps_strline2rS   rS   rT   r    sH   




zLoopLevel.lines)r   r   r   r   r   r   r   r   r  r   r   r  r  ZOner  r  rh   r  rg   r  r  r  r  r  r  rS   rS   rS   rT   r    s   
 	r  c                   @   s   e Zd ZU dZdZeee  ed< dZ	ee
 ed< ede
fddZdd Zed	d
 Zdd Zdd Zde
fddZdd ZdefddZdS )r  aV  
    A loop-nest-like structure. It is built with the `build` method
    as a loop nest and then will perform loop-tiling at some depth.

    A typical case is for vectorization, where we typically do loop-tiling
    at the innermost loop level. A more complicated case is when we do
    2D tiling at both the innermost and outer levels.
    Nr  r/  c           
      C   sz   | j }| j}| j}|dusJ d}tt||D ]\}\}}t||}|s)|g}n|| ||kr6| j|_qt|}	|	S )z4Build a LoopNest with the given `kernel` as the leafN)	rZ  r"  ry  r   r  r  r
  r  r  )
r/  rZ  r"  ry  r  Zloop_idxr   r  r  r  rS   rS   rT   r    s   

zLoopNest.buildc                 C   s
   t | jS r   )rg   r  r  rS   rS   rT   __bool__  r  zLoopNest.__bool__c                 C   s   | j du rtdddS d}d}| j d j}td}| j D ]}|j|kr& n
||j }|d7 }q|t| j k rtt|tjrtt| j | jtjrt|d | j | jk rt|}d}| j | j}t|t| j D ]}| j | j|kro n|d7 }qct||dS )a  
        Maximal allowed depth for parallelism: All reduction or non-reduction levels.
        When the range of the first inner loop beyond the maximum parallel depth is much
        larger than the range of all outer loops within the maximum parallel depth,
        change the starting depth of parallelism to the first inner loop and recalculate
        the maximum parallel depth.
        Nr   r  r4   r  )	r  r   r  r   r#  r  r   r   r  )r  r   	max_depthr  Z
loop_sizesr  r   rS   rS   rT   r    s2   
	






zLoopNest.max_parallel_depthc                 C   s   |j |  j ksJ d| jd usJ t| j|j ksJ | j|j }|j |_|jr1t jd7  _t	|jd |j D ]}d| j| _
q:d S )Nz?Parallel depth cannot exceed the maximal allowed parallel depthr4   T)r   r  r  r   r   r  r  r   Zparallel_reduction_countr  r  )r  r  r  r   rS   rS   rT   r  )  s   zLoopNest.mark_parallelc                 C   s*   | j sJ | j | || j |< | j | S )z
        Do loop-tiling at the `depth` level with `factor`.
            for (x0 = 0; x0 < x0_end; x0++)
            ->
            for (x0 = 0; x0 < x0_end; x0 += factor)
        See details in Note [tiled_size].
        )r  r  )r  r  rh  rS   rS   rT   r  6  s   

zLoopNest.tiler   c                 C   rA  r   r/  r  rS   rS   rT   r  B  rC  zLoopNest.get_kernelc                 C   r  r   r  r  rS   rS   rT   r  F  r  zLoopNest.set_kernellevelc                 C   sH   | j sJ t| j |ksJ |t| j krd n| j |d  }t|| jS r   )r  r   r  r/  )r  r  r  rS   rS   rT   r)  I  s   
 zLoopNest.from_loop_level)r   r   r   r   r  r   r   r  r   r/  ru  r  r  r  r$   r  r  r  r  r  rh   r)  rS   rS   rS   rT   r    s   
 	
&r  r  r   )r   dataclassesr  r  r"  r   r   rx  rx  collections.abcr   enumr   typingr   r   r   r   r   r   rz   Ztorch.fxZtorch._inductorr	   Ztorch._prims_commonr
   r   Ztorch.utils._ordered_setr   Ztorch.utils._sympy.functionsr   r   r   Ztorch.utils._sympy.symbolr   r   r   Z_dynamo.utilsr   rP   r   r   r   r   r   r   r  r   r   r   r   r   r    r!   r"   r#   utilsr$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   Zvirtualizedr0   r1   r2   r3   commonr5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   Z	cpp_utilsrA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   ry  rR   	lru_cacherU   Z_loggingZgetArtifactLoggerr   Zschedule_logZNATIVE_OMP_RTYPESZRTYPE_TO_CPPrM  ZPYTHON_TO_CPPZCONTAINER_PYTHON_TO_CPPrO  rP  ry   rN  ri   rg   rQ  rR  r  r  rm   r   r~   r   rn   r   r   r   r   r   r   rj   r   rh   r   r   r   r   r   	dataclassr   r   r3  rD  Z_initialize_pointwise_overridesr  rq  rr  ru  r  rs  r   r|  r}  r  r-  r  r  r  rs  r  r  rS   rS   rS   rT   <module>   sl  
 $8	8@

	"
.
0


!>
 !!   
1     
z    >      4  ], f    [#      G(U