o
    Zh                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZ d dlZd dlZd dlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl#m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4 ddl5m6Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZB ddlCmDZD ddlEmFZF ddlGmHZHmIZImJZJ erd dlKmLZL ddl*mMZMmNZN eOePZQdd ZRG dd deSZTG d d! d!e@ZUeU jVZWe@ jVZXejYd"ejZd#ej[d$ej\d%ej]d&ej^d'ej_d(ej`d)ejad*ejbd+ejcd,ejdd-ejed.iZfd/d0 Zgd1d2 ZhG d3d4 d4e?Zieijd5 G d6d7 d7e;ZkejlG d8d9 d9Zmd:d; Znd<d= ZoG d>d? d?eIZpG d@dA dAeJZqdS )B    )annotationsN)defaultdict)inf)AnyCallablecastOptionalTYPE_CHECKINGUnion   )is_integer_dtype)
OrderedSet)FloorDivModularIndexing)symbol_is_typeSymT)ValueRanges   )configir)HalideCodeCache)get_reduction_combine_fn)is_metric_table_enabledlog_kernel_metadata)AddParenHandler)HalideInputSpec
HalideMeta)get_bounds_index_exprget_kernel_metadataparallel_num_threadssympy_index_symbol
sympy_subs)_opsV   )	BackendFeatureCSEVariableDeferredLineIndentedBufferKernelArgTypeOpOverridesPythonPrinterSizeArg	TensorArg)DTYPE_TO_CPP)cexpr)constant_repr
SIMDKernelSIMDScheduling)Sequence)ReductionType	StoreModec                 C  sv   t | tr*d|   krdks*n ttj}| |jkrdS | |jkr$dS d| dS t | tr7dt|  dS t	| S )Ni   izhl.Int(64).min()zhl.Int(64).max()zhl.i64()zhl.f64()

isinstanceinttorchZiinfoint64minmaxfloatr0   repr)valinfo rA   M/var/www/auris/lib/python3.10/site-packages/torch/_inductor/codegen/halide.pyhalide_constant=   s    


rC   c                      s   e Zd Zd fddZ  ZS )UnsupportedreturnNonec                   s   t  d|  d S )Nz!halide backend does not support: )super__init__)selfthing	__class__rA   rB   rH   K      zUnsupported.__init__rE   rF   )__name__
__module____qualname__rH   __classcell__rA   rA   rK   rB   rD   J   s    rD   c                      s   e Zd Zedd Zedd Zdd Zdd Zd	d
 Zdd Z	e	Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Z fd+d,Zd-d. ZeZd/d0 Zd1d2 Z  ZS )3HalidePrinterc                 C  s   dt jj d|  dS )Nhl.cast(, r6   )r#   kernelindex_dtypeexprrA   rA   rB   
cast_indexP   s   zHalidePrinter.cast_indexc                 C     d|  dS )Nhl.cast(hl.Float(32), r6   rA   rX   rA   rA   rB   
cast_floatT      zHalidePrinter.cast_floatc                 C  s   d| dS )Nhl.f32(r6   rA   rI   rY   rA   rA   rB   _print_FloatX   s   zHalidePrinter._print_Floatc                 C  *   t |jdks	J d| |jd  dS )Nr$   r_   r   r6   lenargs_printr`   rA   rA   rB   _print_ToFloat[      zHalidePrinter._print_ToFloatc                 C  0   t |jdks	J | d| |jd  dS )Nr$   	hl.floor(r   r6   rd   re   rZ   rf   r`   rA   rA   rB   _print_floor_      zHalidePrinter._print_floorc                 C  ri   )Nr$   	hl.trunc(r   r6   rk   r`   rA   rA   rB   _print_Truncc   rm   zHalidePrinter._print_Truncc                 C  ri   )Nr$   hl.ceil(r   r6   rk   r`   rA   rA   rB   _print_ceilingi   rm   zHalidePrinter._print_ceilingc                 C  s   d|  | | dS Nzhl.sqrt(r6   )r]   rf   r`   rA   rA   rB   _helper_sqrtm   s   zHalidePrinter._helper_sqrtc                 C  sH   |  |jd }|  |jd }|  |jd }d| d| d| dS )Nr   r$   r   
hl.select(rU   r6   )doprintre   )rI   rY   cpqrA   rA   rB   _print_Wherep   s   zHalidePrinter._print_Wherec                 C  r   t |jdkr| |jd S t |jd }| tj|jd |  }| tj|j|d   }d| d| dS )Nr$   r   r   hl.min(rU   r6   )rd   re   rf   sympyZMinrI   rY   midabrA   rA   rB   
_print_Minv   s   zHalidePrinter._print_Minc                 C  rz   )Nr$   r   r   hl.max(rU   r6   )rd   re   rf   r|   ZMaxr}   rA   rA   rB   
_print_Max   s   zHalidePrinter._print_Maxc                 C  ri   )Nr$   hl.abs(r   r6   rk   r`   rA   rA   rB   
_print_Abs   rm   zHalidePrinter._print_Absc                 C  rb   )Nr$   zhl.cos((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_cos   rh   z&HalidePrinter._print_OpaqueUnaryFn_cosc                 C  rb   )Nr$   z	hl.cosh((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_cosh   rh   z'HalidePrinter._print_OpaqueUnaryFn_coshc                 C  rb   )Nr$   z	hl.acos((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_acos   rh   z'HalidePrinter._print_OpaqueUnaryFn_acosc                 C  rb   )Nr$   zhl.sin((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_sin   rh   z&HalidePrinter._print_OpaqueUnaryFn_sinc                 C  rb   )Nr$   z	hl.sinh((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_sinh   rh   z'HalidePrinter._print_OpaqueUnaryFn_sinhc                 C  rb   )Nr$   z	hl.asin((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_asin   rh   z'HalidePrinter._print_OpaqueUnaryFn_asinc                 C  rb   )Nr$   zhl.tan((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_tan   rh   z&HalidePrinter._print_OpaqueUnaryFn_tanc                 C  rb   )Nr$   z	hl.tanh((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_tanh   rh   z'HalidePrinter._print_OpaqueUnaryFn_tanhc                 C  rb   )Nr$   z	hl.atan((r   r6   rc   r`   rA   rA   rB   _print_OpaqueUnaryFn_atan   rh   z'HalidePrinter._print_OpaqueUnaryFn_atanc                   sT   |j r	t |S |j\}}| | |}| | |}| d| d| dS )Nrj   z / r6   )
is_integerrG   _print_FloorDivre   r]   ru   rZ   )rI   rY   xdivrK   rA   rB   r      s   
zHalidePrinter._print_FloorDivc                 C  ri   )Nr$   	hl.round(r   r6   rk   r`   rA   rA   rB   _print_Round   rm   zHalidePrinter._print_Roundc                 C  s   |j \}}d| d| dS )N() / (z+hl.f32(0)))re   )rI   rY   r   r   rA   rA   rB   _print_IntTrueDiv   s   
zHalidePrinter._print_IntTrueDivc                 C  s>   |j \}}| |}t|}dd|  d| dd| dS )Nr_   g      $@z)*hl.round((z	)*hl.f32()))re   rf   r8   )rI   rY   r?   nrA   rA   rB   _print_RoundDecimal   s   

"z!HalidePrinter._print_RoundDecimal) rO   rP   rQ   staticmethodrZ   r]   ra   rg   rl   ro   Z_print_TruncToIntrq   rs   ry   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Z_print_RoundToIntr   r   rR   rA   rA   rK   rB   rS   O   s<    

	
	rS   z	hl.Bool()zhl.BFloat(16)zhl.Float(16)zhl.Float(32)zhl.Float(64)z	hl.Int(8)z
hl.Int(16)z
hl.Int(32)z
hl.Int(64)z
hl.UInt(8)zhl.UInt(16)zhl.UInt(32)zhl.UInt(64)c                 C  s   t |  S N)_halide_typedtyperA   rA   rB   halide_type      r   c                 C  s<   t | r| jr| tjkrtj} | tjtjfv rtj} t| S r   )	r   	is_signedr9   r:   int32float16bfloat16float32r   r   rA   rA   rB   halide_acc_type   s
   r   c                   @  s  e Zd Ze		ddddZedd	d
Zedd Zedd Zedd Z	edd Z
edd Zedd Zedd Zedd Zedd Zedd Zedd  Zed!d" Zed#d$ Zed%d& Zed'd( Zed)d* Zed+d, Zed-d. Zed/d0 Zed1d2 Zed3d4 Zed5d6 Zed7d8 Zed9d: Zed;d< Zed=d> Z ed?d@ Z!edAdB Z"edCdD Z#edEdF Z$edGdH Z%edIdJ Z&edKdL Z'edMdN Z(edOdP Z)edQdR Z*edSdT Z+edUdV Z,edWdX Z-edYdZ Z.ed[d\ Z/ed]d^ Z0ed_d` Z1edadb Z2edcdd Z3ededf Z4edgdh Z5edidj Z6edkdl Z7edmdn Z8edodp Z9edqdr Z:edsdt Z;edudv Z<edwdx Z=edydz Z>ed{d| Z?ed}d~ Z@edddZAedd ZBedd ZCedd ZDdS )HalideOverridesNTr   torch.dtype	src_dtypeOptional[torch.dtype]c                 C  s,   |t jkrd|  dS dt| d|  dS )Nr   z != 0)rT   rU   r6   )r9   boolr   )r   r   r   Zuse_compute_typesrA   rA   rB   to_dtype   s   
zHalideOverrides.to_dtypec                 C  s\   |t jt jfv rdt| d|  d} dt| d|  d}|t jt jfv r,d| d}|S )NrT   rU   r6   zhl.reinterpret(r\   )r9   r   r   r   )r   r   r   linerA   rA   rB   to_dtype_bitcast   s   z HalideOverrides.to_dtype_bitcastc                 C  s   |  t||S r   )r   rC   )clsvaluer   rA   rA   rB   constant  s   zHalideOverrides.constantc                 C  r[   )Nr   r6   rA   r   rA   rA   rB   abs  r^   zHalideOverrides.absc                 C  s0   t | dsd|  dS d|  d| j d|  dS )Nnamehl.exp(r6   z"hl.fast_exp(hl.cast(hl.Float(32), z)) if z!.type().bits() <= 32 else hl.exp(hasattrr   r   rA   rA   rB   exp
  s   
zHalideOverrides.expc                 C  r[   )Nr   r6   rA   r   rA   rA   rB   libdevice_exp  r^   zHalideOverrides.libdevice_expc                 C  r[   rr   rA   r   rA   rA   rB   sqrt  r^   zHalideOverrides.sqrtc                 C  h   t | dsd|  d| dS d| j d| d}d|  d| d	|  d
|  d| d| j d|  d| dS )Nr   r{   rU   r6   rT   	.type(), hl.select((<)|hl.is_nan(), ) if z.type().is_float() else hl.min(r   r   r   rA   rA   rB   minimum     
8zHalideOverrides.minimumc                 C  r   )Nr   r   rU   r6   rT   r   r   >r   r   r   z.type().is_float() else hl.max(r   r   rA   rA   rB   maximum   r   zHalideOverrides.maximumc                 C  s6   t |drd|j d| d}d|  d| d| dS )Nr   rT   r   r6   rt   rU   r   )r   r   rv   rA   rA   rB   where(  s   
zHalideOverrides.wherec                 C  r[   )Nzhl.cos(r6   rA   r   rA   rA   rB   cos.  r^   zHalideOverrides.cosc                 C  r[   )Nzhl.sin(r6   rA   r   rA   rA   rB   sin2  r^   zHalideOverrides.sinc                 C     t d)NlgammarD   r   rA   rA   rB   r   6     zHalideOverrides.lgammac                 C  r[   )Nzhl.erf(r6   rA   r   rA   rA   rB   erf:  r^   zHalideOverrides.erfc                 C  r[   )Nzhl.cosh(r6   rA   r   rA   rA   rB   cosh>  r^   zHalideOverrides.coshc                 C  r[   )Nzhl.sinh(r6   rA   r   rA   rA   rB   sinhB  r^   zHalideOverrides.sinhc                 C  r[   )Nzhl.acos(r6   rA   r   rA   rA   rB   acosF  r^   zHalideOverrides.acosc                 C  r[   )Nz	hl.acosh(r6   rA   r   rA   rA   rB   acoshJ  r^   zHalideOverrides.acoshc                 C  r[   )Nzhl.asin(r6   rA   r   rA   rA   rB   asinN  r^   zHalideOverrides.asinc                 C  r[   )Nz	hl.asinh(r6   rA   r   rA   rA   rB   asinhR  r^   zHalideOverrides.asinhc                 C     d|  d| dS )Nz	hl.atan2(rU   r6   rA   r   yrA   rA   rB   atan2V     zHalideOverrides.atan2c                 C  r[   )Nzhl.atan(r6   rA   r   rA   rA   rB   atanZ  r^   zHalideOverrides.atanc                 C  r[   )Nz	hl.atanh(r6   rA   r   rA   rA   rB   atanh^  r^   zHalideOverrides.atanhc                 C  r   )Ncopysignr   r   rA   rA   rB   r   b  r   zHalideOverrides.copysignc                 C  r   )Nerfinvr   r   rA   rA   rB   r   f  r   zHalideOverrides.erfinvc                 C  r   )Nz	hl.hypot(rU   r6   rA   r   rA   rA   rB   hypotj  r   zHalideOverrides.hypotc                 C  r   )N	nextafterr   r   rA   rA   rB   r   n  r   zHalideOverrides.nextafterc                 C     |  d| S Nz & rA   r   rA   rA   rB   logical_andr     zHalideOverrides.logical_andc                 C  s
   |  dS )Nz == 0rA   r   rA   rA   rB   logical_notv     
zHalideOverrides.logical_notc                 C  r   Nz | rA   r   rA   rA   rB   
logical_orz  r   zHalideOverrides.logical_orc                 C  r   )Nr    ^ r6   rA   r   rA   rA   rB   logical_xor~  r   zHalideOverrides.logical_xorc                 C  r   r   rA   r   rA   rA   rB   bitwise_and  r   zHalideOverrides.bitwise_andc                 C  s
   d|  S )N~rA   r   rA   rA   rB   bitwise_not  r   zHalideOverrides.bitwise_notc                 C  r   r   rA   r   rA   rA   rB   
bitwise_or  r   zHalideOverrides.bitwise_orc                 C  r   )Nr   rA   r   rA   rA   rB   bitwise_xor  r   zHalideOverrides.bitwise_xorc                 C  r   )Nz << rA   r   rA   rA   rB   bitwise_left_shift  r   z"HalideOverrides.bitwise_left_shiftc                 C  r   )Nz >> rA   r   rA   rA   rB   bitwise_right_shift  r   z#HalideOverrides.bitwise_right_shiftc                 C  r   )Nzhalide_helpers.rand(rU   r6   rA   seedoffsetrA   rA   rB   rand  r   zHalideOverrides.randc                 C  r   )Nzhalide_helpers.randn(rU   r6   rA   r   rA   rA   rB   randn  r   zHalideOverrides.randnc              	   C  s   d|  d| d| d| d	S )Nzhalide_helpers.randint64(rU   r6   rA   )r   r   lowhighrA   rA   rB   	randint64  s   zHalideOverrides.randint64c                 C  s"   t | d dtjjd| S )Nr    + Zload_seed_offset)opsloadr#   rV   re   Zseed_offset)r   r   rA   rA   rB   	load_seed  s   "zHalideOverrides.load_seedc                 C  r[   )Nz1./hl.sqrt(r6   rA   r   rA   rA   rB   rsqrt     zHalideOverrides.rsqrtc                 C  r[   )Nzhl.tan(r6   rA   r   rA   rA   rB   tan  r^   zHalideOverrides.tanc                 C  r[   )Nzhl.tanh(r6   rA   r   rA   rA   rB   tanh  r^   zHalideOverrides.tanhc                 C  r[   )Nz3(hl.reinterpret(hl.UInt(32), hl.cast(hl.Float(32), z)) >> 31) != 0rA   r   rA   rA   rB   signbit  r^   zHalideOverrides.signbitc                 C  s   |  d|  d| d| S )Nz - hl.trunc(/z)*rA   r   rA   rA   rB   fmod  s   zHalideOverrides.fmodc                 C  r   )Nzhl.pow(rU   r6   rA   r   rA   rA   rB   pow  r   zHalideOverrides.powc                 C  r[   )Nzhl.log(r6   rA   r   rA   rA   rB   log  r^   zHalideOverrides.logc                 C  r[   )Nz hl.is_inf(hl.cast(hl.Float(32), r   rA   r   rA   rA   rB   isinf  r   zHalideOverrides.isinfc                 C  r[   )Nz hl.is_nan(hl.cast(hl.Float(32), r   rA   r   rA   rA   rB   isnan  r   zHalideOverrides.isnanc                 C  r[   )Nr   r6   rA   r   rA   rA   rB   round  r^   zHalideOverrides.roundc                 C  r[   )Nrj   r6   rA   r   rA   rA   rB   floor  r^   zHalideOverrides.floorc                 C  r   )Nr   r   z + hl.f32(0))rA   r   rA   rA   rB   int_truediv  r   zHalideOverrides.int_truedivc                 C     d| j  d|  d| dS )Nz"hl.floor(hl.cast(hl.Float(max(32, .type().bits())), ) / r6   r   r   rA   rA   rB   floordiv  s   zHalideOverrides.floordivc                 C  sL   t t d|tj}t t |dtj}t ||}d|j d| dS )N0rT   r   r6   )r   r   ltr9   int8subr   )r   r   leftrightr  rA   rA   rB   sign  s   zHalideOverrides.signc                 C  r[   )Nrn   r6   rA   r   rA   rA   rB   trunc  r^   zHalideOverrides.truncc                 C  r  )Nz"hl.trunc(hl.cast(hl.Float(max(32, r  r  r6   r  r   rA   rA   rB   truncdiv  s   zHalideOverrides.truncdivc                 C  r[   )Nrp   r6   rA   r   rA   rA   rB   ceil  r^   zHalideOverrides.ceilc                 C  r[   )Nr   z, 0)rA   r   rA   rA   rB   relu  r^   zHalideOverrides.reluc                 C  sR   t j|}t jjt j|t j|t|d}|tjtj	fvr't
||S |S Nbounds)r#   rV   prepare_indexinggenfuncindex_to_strused_dims_from_indexr   r9   r   r:   r   r   )r   rY   r   indexvarrA   rA   rB   
index_expr   s   

zHalideOverrides.index_exprc                 C  s.   t |tj}t |||}||_tt|S r   )r   r   r9   r   halide_clampindirect_indexing_sizer    str)r   Z	index_varsizecheckZwrap_negrA   rA   rB   indirect_indexing  s   z!HalideOverrides.indirect_indexingc                 C  sN   t jt j|d }t|ttjfsd|j d| d}d| d| dS )Nr$   rT   r   r6   z	hl.clamp(z, 0, )	r#   rV   kexprrename_indexingr7   r8   r|   Integerr   )r   r   r*  r+  endrA   rA   rB   r'    s   zHalideOverrides.halide_clampc                 C  s~   t j| |}| }W d    n1 sw   Y  |jjr"t|}t jjd|j dt| dg t	
|d}t|||S )NrT   r   r6   r  )r#   rV   Z
mask_loadsr  Zis_boolr   r!  r   rC   r   wrapr   r   )maskbodyotherZnew_maskresultrA   rA   rB   masked  s   zHalideOverrides.maskedc                 C  r   )Nfrexp)NotImplementedErrorr   rA   rA   rB   r7  .  r   zHalideOverrides.frexp)NT)r   r   r   r   )r   r   r   r   )TT)ErO   rP   rQ   r   r   r   classmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r	  r
  r  r  r  r  r  r  r  r  r&  r,  r'  r6  r7  rA   rA   rA   rB   r      s   




























































r   halidec                      sN   e Zd ZedZ	dd fd	d
Zdd Zdd ZdddZ	dd Z
  ZS )HalideCSEVariablez\b(tmp\d+)\[\?\]Nr  ValueRanges[Any]r   r   rE   rF   c                   s   t  ||| d | _d S r   )rG   rH   	used_dimsrI   r   r  r   rK   rA   rB   rH   9  s   
zHalideCSEVariable.__init__c                 C  sd   t | jpd}t|| D ]}t|tr(|jd us"J |||f||j qtj	
|| _d S )NrA   )r   r=  	itertoolschainvaluesr7   r;  updater#   rV   sort_used_dims)rI   r   re   kwargsusedargrA   rA   rB   update_on_argsB  s   
z HalideCSEVariable.update_on_argsc                 C  s6   t |dkr| j dS | j ddtt| dS )Nr   z[()][rU   ])rd   r   joinmapr)  )rI   dimsrA   rA   rB   	index_strJ  s   zHalideCSEVariable.index_strr)  c                 C  s"   | j d u r| j dS | | j S )Nz[?])r=  r   rM  rI   rA   rA   rB   __str__P  s   
zHalideCSEVariable.__str__c                   s<   | j d urtdd | j D sJ |  fdd| j D S )Nc                 s      | ]	}t |tjV  qd S r   r7   r|   Expr.0r   rA   rA   rB   	<genexpr>W      
z-HalideCSEVariable.subs_str.<locals>.<genexpr>c                   s   g | ]}  ||qS rA   )getrT  r   replacementsrA   rB   
<listcomp>Z      z.HalideCSEVariable.subs_str.<locals>.<listcomp>)r=  allrM  )rI   rZ  rA   rY  rB   subs_strV  s   zHalideCSEVariable.subs_strr   )r  r<  r   r   rE   rF   )rE   r)  )rO   rP   rQ   recompileundefined_rerH   rG  rM  rO  r^  rR   rA   rA   rK   rB   r;  6  s    
	
r;  c                      sB   e Zd ZU ded< ded< ded< d fdd	ZdddZ  ZS )DimensionInfozOptional[sympy.Expr]rY   
sympy.Exprr*  striderE   rF   c                   s<   t    tjj|dr| }| }|| _|| _|| _d S Nr   )	rG   rH   r#   graphsizevarsstatically_known_ltrY   r*  rd  )rI   rY   r*  rd  rK   rA   rB   rH   c  s   

zDimensionInfo.__init__NFc                 C  s   | j d usJ | j }|r|dkrdS |rHi |}|jD ]'}t|tjrBt|tjs+J tj	
|j}t|ts9J t||||< qt||}tj	|S )Nr   hl.Var())rY   free_symbolsr   r   TMPr7   r|   Symbolr#   rV   lookup_cse_varr   r;  r    r^  r!   r"  )rI   rZ  	zero_varsrY   symr%  rA   rA   rB   rM  l  s   

zDimensionInfo.index_strrN   NF)rO   rP   rQ   __annotations__rH   rM  rR   rA   rA   rK   rB   rb  ]  s   
 	rb  c                 C  sj   t jj| |r
dS zt jj| }t jj|}W n
 ty$   Y dS w ||kr1t jj| | ||kS NTF)r#   rf  rg  statically_known_equals	size_hint	TypeErrorZguard_equals)r  r  r   r   rA   rA   rB   eq}  s   rv  c                 C  s   t jj| |r
dS zt jj| }t jj|}W n ty4   t| |}|| kr1| |k Y S Y dS w ||k rAt jj| | ||k S rr  )	r#   rf  rg  rh  rt  ru  r|   gcdZguard_lt)r  r  r   r   rw  rA   rA   rB   r    s   r  c                      sZ  e Zd ZU eZeZded< de fddZdfddZ	dgddZ
dh fddZdd Zdd Zdi fddZdd Zdjd"d#Zd$d% Zd&d' Zdid(d)Zd*d+ Zdkd-d.Zdld0d1Zdmd2d3Z	dndod8d9Zdpd>d?Zd@dA ZdqdHdIZe dJdrdLdMZdrdNdOZdmdPdQZ dRdS Z!dsdUdVZ"dndWdXZ#e$dYdZ Z%dndmd[d\Z&d]d^ Z'dtdcddZ(  Z)S )uHalideKernelzCallable[[sympy.Expr], str]r-  tilingdict[str, sympy.Expr]rE   rF   c                   s|   t  j|fi | | j| _| j| _| j| _t | _| j| _	| j| _
i | _i | _i | _i | _i | _i | _tt| _d| _d S rp  )rG   rH   r3  ZcomputeloadsZstoresr(   Zindexing_code_dominside_reductionZneeds_dom_indexingZhas_reductionbuffer_dimensionsbuffer_offsetshalide_varsindex_replacementsreduction_renamesdom_renamesr   listbuffer_aliaseshas_indirect_indexing)rI   ry  rD  rK   rA   rB   rH     s   

zHalideKernel.__init__r   r   r)  c                 C  s   t |S r   )r   )rI   r   rA   rA   rB   dtype_to_str  r   zHalideKernel.dtype_to_strNc                 C  s$   | j | d|d t|||S )Nz = hl.Func(r6   )r3  	writeliner;  r>  rA   rA   rB   create_cse_var  s   zHalideKernel.create_cse_varindicesSequence[sympy.Expr]c                   s  j s	js	jrJ tjtjjjt	dt
tt j|}tt   dd tjdd jD D dd  fdd	} fd
d}|D ]8}|trc|ttdtdtd| |trw|ttdtd|  t |j qItdd  D _d}tjD ]} fdd|j ! D }|j"fddd |s|#|$d|j% d}tj&j'g }	|t(|k rt)|j%sfdd|D }
|t(|
7 }|
sJ |t*tjjj+|
 |
,fdd|D  |
rt*tj-|
t)dr$|j% t)drJ g }
t(|}d}t.dt(j }|j/r?t.dt(j j|< j|< |	#|f 9 fdd|D }|t(|7 }t(|
}fdd|
D }
t(|
|k sy|dksyJ |
,| |
s |t(|k rt)|j%r|D ]}zRd}dt)|j0s|	| \}}|d7 }|9 t)|j0rd}tj&j1}t)|j2|s|	| \}}|d7 }||| 7 }||9 }t)|j2|r|j |3 < W q t4y"   |sJ tj&j1}tj&j'}|	D ]\}}||| 7 }||9 }qtjj5t||j0|j2jj |3 < Y qw qjD ]}j67| d|j8d  q'jrN9d!fd"dj: D  d#S d#S )$a  
        Hook called right before codegen with every index that will be
        used in the fused kernel.

        This populates self.halide_vars/index_replacements/reduction_renames which is an alternate indexing
        scheme that avoids using divide and modulus.  Instead of xindex/yindex/rindex
        we base indexing on a larger number of vars whose product combines to those.

        This function populates self.halide_vars, self.index_replacements, and self.reduction_renames
        fallbackc                 S  s   i | ]}|  |qS rA   symbolrX  rA   rA   rB   
<dictcomp>  s    z2HalideKernel.finalize_indexing.<locals>.<dictcomp>c                 S  s   g | ]}|j  qS rA   )nodesrA  )rT  treerA   rA   rB   r[        z2HalideKernel.finalize_indexing.<locals>.<listcomp>c                 S  s   t tjj| S r   )r|   simplifyr#   rf  rg  Zremove_precomputed_replacementsrX   rA   rA   rB   r    s   z0HalideKernel.finalize_indexing.<locals>.simplifyc                   sJ   | v r#|  }  |j|j| tjj|t|j	|
  d S d S r   )addrootlookupdivisorr#   rf  rg  Zevaluate_minr   lengthr  )baser  modulusnodeall_used_symbolssym_to_noderA   rB   visit_modular_indexing  s   z>HalideKernel.finalize_indexing.<locals>.visit_modular_indexingc                   s>   | v r|  }  |j|j| t|j|  d S d S r   )r  r  r  r  r   r  r  )r  r  r  r  rA   rB   visit_floor_div  s   
z7HalideKernel.finalize_indexing.<locals>.visit_floor_divr  r  r  c                 s  rP  r   )r   r   ZINDIRECTrT  ro  rA   rA   rB   rU    rV  z1HalideKernel.finalize_indexing.<locals>.<genexpr>Fc                   s   g | ]
}|   v r|qS rA   r  rX  )r  rA   rB   r[        c                   s
    | j S r   r  )r   )rt  rA   rB   <lambda>  s   
 z0HalideKernel.finalize_indexing.<locals>.<lambda>keyr$   r   c                   s"   g | ]}t |j r|jqS rA   rv  r  r  rX  )r  r  rA   rB   r[    s
    c                   s2   g | ]}t  |jrt |jr|j  qS rA   )r  r  rX  )r  r0  r  rA   rB   r[     s    

Thhrc                   s   g | ]}t |j r|jqS rA   r  rX  r  rA   rB   r[  8  s    c                   s$   g | ]}t | st|  qS rA   )rv  r|   r  rT  s)	next_sizerA   rB   r[  ;  s    z
 = hl.Var(r6   Zrdomc                      i | ]
\}}| j | qS rA   r  rT  vrvrN  rA   rB   r  h  r  N);r  r  r  	functoolspartialr#   rf  rg  rt  r   dictfromkeysrK  rG   r   r   r   r?  r@  from_iterablerange_treeshasr   replacer|   Wildr   rB  rj  anyr  reversedr  rA  sortappendr  numelSOnerd   rv  reduceevaluate_maxextendrw  r    Zis_reductionr  Zeror  r  
IndexErrorsimplify_with_rangesindexing_coder  r   codegen_rdomitems)rI   r  r  r  r$  Zhad_fallbackr  r  Zhandled_countZadded_sym_sizeZsizes_to_addro  Z	new_sizesZ	prior_lenr  idxr*  r  rY   Z
full_indexrd  rK   )r  r  r0  r  rI   r  rt  r  rB   finalize_indexing  s   








.

zHalideKernel.finalize_indexingc                   s    j rdnd}| jv r j| S i } j D ]$} j s#| jv r#qtd|j}|s.J td| |	d ||< q 
| d fdd| D  | j|< |S )	zCRDom based indexing uses explicit iteration ranges for Func updatesioz^h(\d+)$r  r$   domc                   r  rA   r  r  rN  rA   rB   r  z  r  z3HalideKernel.setup_dom_indexing.<locals>.<dictcomp>)r|  r  r  keysr  r_  matchr   r    groupr  r  )rI   prefixrenamesr%  mrA   rN  rB   setup_dom_indexingk  s   


zHalideKernel.setup_dom_indexingc              	     sl    fdd|  D } j| dd| d t| D ]\}} j| d| d| d q d S )	Nc                   s$   g | ]}d    | dqS )hl.Range(0, r6   )r-  r.  )rT  r*  rN  rA   rB   r[    s    z-HalideKernel.codegen_rdom.<locals>.<listcomp>z = hl.RDom([rU   ]) = rH  rI  )rA  r  r  rJ  	enumerater  )rI   r   varsZrsizesr  ZrsymrA   rN  rB   r    s   
 zHalideKernel.codegen_rdomr$  rc  c                   s*   t  |}t|| j}tjj|| jS r   )	rG   r   r!   r  r#   rf  rg  r  r  )rI   r$  rK   rA   rB   r     s   zHalideKernel.prepare_indexingc                 C  s$   t |tjr| |jjS | j| S )zThe size of an index symbol)r   r   rk  rm  r   r(  r  )rI   ro  rA   rA   rB   sym_size  s   
zHalideKernel.sym_sizer%  is_storer   c                   s4  g t |jdd dD ] }t|tjtjfr| qt|tjtjtj	fs+J |qt
jj}dd D g }t
|}t|t
jrJ|jn|gD ]i}fdd|jD tdkrd||7 }qMtd	krud   |7  < qMg }tt|D ]0}	||	 d
usJ ||	 \}
}t|
t@ rfdd|
D  ||7 }q}||
|f q}g ||f}qM fdd}g }|D ]\}}|D ]	}||7 }q|||| qÈ D ]\}}||||g q|jdd d |s	jr|tt
jjd	d	 n tjj|d jd	s)| dtt
jj r!d	n|d jd	 |rc sc|j!v rRtjj"|j!| rR#||j!|   j!| }ntjj$|drc#|| d}|}t%& D ].}	'||| rz||f  S  rJ | d|	 }|j(| vrj(| | qid
S )zEConvert address-based indexing into dimensions using self.halide_varsc                 S  s   | j S r   r  r   rA   rA   rB   r    s    z5HalideKernel.indexing_to_dimensions.<locals>.<lambda>r  c                 S  s   i | ]}|t jjqS rA   )r|   r  r  r  rA   rA   rB   r    r  z7HalideKernel.indexing_to_dimensions.<locals>.<dictcomp>c                      g | ]}| v r|qS rA   rA   rT  r  )
split_exprrA   rB   r[    r\  z7HalideKernel.indexing_to_dimensions.<locals>.<listcomp>r   r$   Nc                      g | ]}| vr|qS rA   rA   r  )	part_varsrA   rB   r[    r\  c                   s   t | } t|dkr,t jdd}| ||d  }|r,t|d |d || S  r2J | t t| fdd|D d }t j	j
}t| t jrn| jD ]}t|t jrm||9 }t | | } t t || }qPt| ||S )Nr$   Zwild)excluder   c                   s   i | ]
}|  |d  qS )r$   )r  r  rN  rA   rB   r    r  zRHalideKernel.indexing_to_dimensions.<locals>.expr_to_dimension.<locals>.<dictcomp>)r|   factorrd   r  r  rb  r  r  r!   r  r  r7   ZMulre   r/  Zceiling)rY   symsZstride_wildr  r  rd  term)r  rI   symbolsrA   rB   expr_to_dimension  s*   

z>HalideKernel.indexing_to_dimensions.<locals>.expr_to_dimensionc                 S  s   t jjj| jtdS )Nr  )r#   rf  rg  rt  rd  r   )drA   rA   rB   r    s    Z_view))sortedrj  r   r   HALIDErk  r  UNBACKED_INTSIZEPRECOMPUTED_SIZEr|   r  r  expandr.  r7   Addre   rd   ranger   r  popr  r  r  rb  r#   rf  rg  rs  rd  insertr~  statically_known_geqapply_offset_to_dimensionZstatically_known_gtr?  countinstall_dimsr  )rI   r%  r$  r  ro  r   Zsplit_failedpartZnew_split_failedr  Z
other_varsZ
other_partr  rL  r  rY   r  Zorig_varrA   )r  r  rI   r  r  rB   indexing_to_dimensions  s   


 

z#HalideKernel.indexing_to_dimensionsc                 C  s   || j vr|| j |< || j|< dS | j| |ks#t| j | t|kr%dS |r.| j | |kS t| j | |D ]'\}}|j|jkrC dS |j|jksO|j|jkr]tjj	
|j|j|_d|_q6dS )z>Try to set self.buffer_dimensions[var], return True on successTFN)r}  r~  rd   ziprd  r*  rY   r#   rf  rg  r  )rI   r%  rL  r   r  oldnewrA   rA   rB   r    s(   


zHalideKernel.install_dimsc                 C  s   |dkrd S t tt|D ].}|| jdks"tjj||| jr<t||| j}|||| j 8 }||  j	|7  _	q|dksCJ d S )Nr   r$   )
r  r  rd   rd  r#   rf  rg  r  r   rY   )rI   rL  r   r  r  rA   rA   rB   r    s   
z&HalideKernel.apply_offset_to_dimensionc                 C  s   t tj  }|jD ]I}t|tjsJ t|tjr2| |j	}t|t
r)|jdus+J ||j q	t|tjr>|| q	t|tjtjtjtjfrLq	td| | |S )zIDetect which range trees are used to populate HalideCSEVariable.used_dimsNzunhandled symbol )r   r|   rl  rj  r7   r   r   rk  rm  r   r;  r=  rB  r  r  r  r  r  ZINDEXr8  rC  )rI   r$  r=  ro  Zcse_varrA   rA   rB   r#    s"   

z!HalideKernel.used_dims_from_indexc                   sP   t dd  D sJ  fddt| j| j D }t|t ks&J |S )Nc                 s  rP  r   rQ  rS  rA   rA   rB   rU  7      z.HalideKernel.sort_used_dims.<locals>.<genexpr>c                   r  rA   rA   r  r=  rA   rB   r[  8  s
    z/HalideKernel.sort_used_dims.<locals>.<listcomp>)r]  r?  r@  r  r  rA  rd   )rI   r=  ZorderedrA   r  rB   rC  6  s   
zHalideKernel.sort_used_dimsFc                   sH   d  fdd|D }t|dkrd}|S t|dkr"| d}|S )NrU   c                 3  s    | ]	}|  V  qd S r   )rM  )rT  r  rZ  rn  rA   rB   rU  C  r  z.HalideKernel.make_index_str.<locals>.<genexpr>r   ()r$   ,)rJ  rd   )rI   rL  rZ  rn  rM  rA   r  rB   make_index_strB  s   
zHalideKernel.make_index_strr   c           
   
   C  s  | j |}| |}| ||d\}}| d| | d}tj|}|tj	tj
fv r6tj}d| d}| jrt| jtrE| jjdusGJ tg | || jjR }| | |}|jr| j|j d | j|j d| j d | | jp~d	}	| j| d
t| d|	 d | j| d| dt| d|j d |S | j| d| j d| dt| d |S | || |S )z"Codegen a load from an InputBufferFrH  rI  r\   r6   Nz!_mask = hl.RDom([hl.Range(0, 1)])z_mask.where(r   z = hl.cast(rU   r  z + hl.cast(z_mask)z = hl.select(z
, hl.cast(z, 0)))re   inputr   r  r  r#   rf  	get_dtyper9   r   r   r   
_load_maskr7   r;  r=  r   r#  newfuncrC  r3  r  r   r-  Z_load_otherr   r!  )
rI   r   r$  r%  rL  r   r   r=  r5  r4  rA   rA   rB   r   K  s@   

  zHalideKernel.loadc                 C  s   | j jtdd| S )Nz\[.* )csevarname_mapr_  r  rI   r   rA   rA   rB   rm  r  rM   zHalideKernel.lookup_cse_varr   r&   moder5   c              
   C  s>  t |tsJ | j|}| |}| ||d\}}| |s$|durR|  }| ||}|	|}	d
dgt| p>d}
| jt|| d|
 d| d n| j|dd	}t|}	tj|}|du rx| d| d
t| d|	 d}n|dkr| d| dt| d|	 d}ntd| | jt|| dS )z"Codegen a store to an OutputBufferTNrU   ri  r   rH  z] = hl.undef(z.type()))rn  z] = hl.cast(r6   Z
atomic_addz] += hl.cast(zstore mode=)r7   r;  re   outputr   r  Zis_indirect_indexingr  r  r^  rJ  rd   r3  r  r'   r)  r#   rf  r  r   r8  )rI   r   r$  r   r  r%  rL  rZ  rM  	value_strZ
undef_dimsr   r   rA   rA   rB   storeu  s*   

""zHalideKernel.storer   reduction_typer4   +Union[CSEVariable, tuple[CSEVariable, ...]]c                   sT  | j sJ | jr
J |||f}|| jjv r| jj| S t|tr3|dks&J | j|  | jj|< }|S t|tr=|jdus?J t	| j
 |  fdd|jD } t	|j rj| | | t	g |j R }|| j
}tj||}	t|}
|dv r|j d| }| j| d| d| d	 g }d
}t| j
D ]%\}}|| d| d |d
kr|d  d| 7  < || j| 9 }q| j| dd|  nN|dkr| ||}nCt||
}ttt  |||}W d   n1 sw   Y  d|
 dt |	 d	}| j| d|  | j| d|  || jj|< |S )zCodegen a reduction operationZwelford_combineNc                   r  rA   rA   r  Zreduction_varsrA   rB   r[    r\  z*HalideKernel.reduction.<locals>.<listcomp>)ZargmaxZargmin_z = hl.z(rdom, r6   r$   rH  rI  *r  r   Zwelford_reducerT   rU   )!r|  r  r  Zreduction_cacher7   tuplewelford_combine_implr;  r=  r   r  r  r!  rC  r^  r   Z	ReductionZdefault_accumulatorr   r   r3  r  r  r  r  rJ  Zwelford_reduce_fallbackr   r#   set_ops_handlerr   r   rC   )rI   r   r   r  r   	cache_keyZresult_tuple
result_varr  defaultZacc_typer$  partsrd  r  ro  
combine_fncombine_strZdefault_strrA   r  rB   	reduction  sZ   





zHalideKernel.reductionc                 C  sv  t |tr
|jd usJ t |tr|jd usJ t |tr"|jd us$J tg |j|j|jR p3| j}|t| j8 }| | |}dd |||fD }|j}| j	
| dd| d | j	
| d| d | j	
| d| d	 | j	
| d
| d | j	
| d|| j  | j	
| d|| j  | j	
| d|| j  | j	
| d| d| d | j	
| d| d| d | j	
| d| d| d| d | d| d| d| d| d| d| d| d| d| dg}| j	
| dd| d g }	td D ]}
|	| |j | j	
|	d!  d"| d#|
 d$ qt|	S )%Nc                 S  s   g | ]	}d |j  dqS )rT   z.type(), 0)r  rS  rA   rA   rB   r[    s    z5HalideKernel.welford_combine_impl.<locals>.<listcomp>z = hl.Tuple([rU   r  z
_mean_1 = z[0]z_m2_1 = z[1]z_weight_1 = z[2]z
_mean_2 = z_m2_2 = z_weight_2 = z	_delta = z
_mean_2 - Z_mean_1z_new_weight = z_weight_1 + Z	_weight_2z_w2_over_w = hl.select(z_new_weight == 0.0, 0.0, z_weight_2 / z_new_weight)z
_mean_1 + z	_delta * Z
_w2_over_wz_m2_1 + z_m2_2 + z_weight_1 * Z_new_weightr   r  r  rH  rI  )r7   r;  r=  r   r  r  r  rC  r   r3  r  rJ  r^  r  r  r  )rI   meanm2weightr=  r  r  pfxrB  unpackedr  rA   rA   rB   r    sD   &&z!HalideKernel.welford_combine_impldtypestuple[torch.dtype, ...]r  UCallable[[tuple[CSEVariable, ...], tuple[CSEVariable, ...]], tuple[CSEVariable, ...]]values_origtuple[CSEVariable, ...]c              
     s  j sJ t|t|ksJ g }ttj   |D ]:}t|tr%|jd us'J t|jtj@ r7|	| n|	
| g |jg jd d   |j q jritjtj@ skJ dd t||D }jd j}j d}| d}	j| d| d tjdksJ d	g j\}
|
t|	i|
t|	d it|dkrd
d }g}g}n dd }fddtt|D }fddtt|D }j d||  ttt  |||}W d    n	1 sw   Y  j d||  t|dkr3fS  fdd|D }t|D ]\}}j| d d| d qAt|S )Nr$   c                 S  s&   g | ]\}}d t | d| dqS )rT   rU   r6   )r   )rT  r   r   rA   rA   rB   r[    s    z%HalideKernel.scan.<locals>.<listcomp>r  Z_rdomz.xz = hl.RDom([hl.Range(1, z)])z&multi-dimensional scan not implementedc                 S  s   | d S re  rA   r   rA   rA   rB   maybe_tuple$  r   z&HalideKernel.scan.<locals>.maybe_tuplec                 S  s   dd |  dS )Nz
hl.Tuple([rU   r  )rJ  r   rA   rA   rB   r)  +  s   c                   "   g | ]}  d | d qS rH  rI  r^  rT  r  )r  scan_renames_prirA   rB   r[  .      c                   r*  r+  r,  r-  )r  scan_renames_currA   rB   r[  2  r/  r  c                   s   g | ]
}  qS rA   )r  rC  )rT  r  )all_used_dimsrI   rA   rB   r[  C  r  rH  rI  )r|  rd   r   r|   rl  r7   r;  r=  r  r  r!  rB  r  rC  r  r-  r.  r  r  r   r3  r  r    r^  r  r#   r  r   r   r  r  )rI   r$  r  r'  rA  r   initialr  Zscan_domscanZscan_varr)  Z	read_leftZ
read_rightr  Zunpack_varsr  r  rA   )r1  r  r0  r.  rI   rB   r3    sn   




"zHalideKernel.scanr  r;  c                C  s,   | j j| j||d}t|tsJ ||_|S r  )r  generater3  r7   r;  r=  )rI   r   r=  r  r%  rA   rA   rB   r!  H  s   zHalideKernel.genfuncc                 C  s"   | j  }t|tsJ ||_|S r   )r  Znewvarr7   r;  r=  )rI   r=  r%  rA   rA   rB   r  P  s   
zHalideKernel.newfuncc                 C  s   t j|  S )a  
        We map all tensors to 1D buffers in Halide since Halide has trouble representing some strides that PyTorch
        supports.  If there are gaps in the underlying layout the numel we pass to Halide includes the gaps while
        PyTorch's numel excludes them.
        )r#   rf  
get_bufferZ
get_layoutZstorage_sizer
  rA   rA   rB   halide_buffer_numelV  s   z HalideKernel.halide_buffer_numelc                   s   dd }g }| j  \}}}}tt|||dD ].\} || f t trF jdkr2 jdu s4J |	 fdd| j
 jdD  q|S )	zX
        Halide requires scalar inputs before outputs, so need to reorder args.
        c                 S  s6   | \}}t |trdS d|jv rdS d|jv sJ dS )Nr$   Zout_ptrr   Zin_ptrr   )r7   r,   r   )Z	arg_tupleZ	_call_strrF  rA   rA   rB   	arg_orderc  s   

z.HalideKernel.halide_argdefs.<locals>.arg_orderr  r   Nc              	   3  s.    | ]}d t | j j j jdfV  qd S )Nalias_of)r-   bufferr   r   r   )rT  aliasrF  rA   rB   rU  s  s    
z.HalideKernel.halide_argdefs.<locals>.<genexpr>rA   )re   Zpython_argdefsr  r  r  r7   r-   r   r9  r  r  rW  r   )rI   r7  r5  r  r   r   Zcall_strrA   r<  rB   halide_argdefs^  s   

zHalideKernel.halide_argdefsr   c                   s  g }   D ]U\}}t|trd}d}d}d}n4 fdd j|j D } fdd j|j D }t|t|ks<J t j|j }t|j	  d}|
t||j||||jd qtj }|jdkrwtjjg}	tjj}
d	t i}d}nT|jd
ksJ d|jdksJ dtjjg}	tjj}
tj|}d|	d vrdD ]\}}|j|kr|j|kr|	
d| |   nq|	
d d	|ji}td|j}|	
d |	
d tjj s|	
d tjj!r|	
d d j"v r|	
d t#|d$|	|
||dS )z)Compute metadata required by codecache.pyNlongc                      g | ]
}t  |jqS rA   )r/   r.  r*  rS  rN  rA   rB   r[        z3HalideKernel.halide_kernel_meta.<locals>.<listcomp>c                   r?  rA   )r/   r.  rd  rS  rN  rA   rB   r[    r@  r  )shaperd  r   r9  cpuZparallelismcudazonly cpu/cuda supportedr   zonly default device supportedZcuda_capability))      )rD  r   )      )rF  r   )rE  r$   Zcuda_capability_Zuser_contextZstrict_floatZ
no_runtimeZ
no_assertsdebugZ64Zlarge_buffers-)target	schedulerscheduler_flagscuda_device)%r=  r7   r,   r}  r   rd   r/   r~  r.   r   r  r   r9  r#   rf  get_current_device_or_throwtyper   r:  Z
cpu_targetZscheduler_cpur   r$  Z
gpu_targetscheduler_cudar9   rC  Zget_device_propertiesmajorminorZmulti_processor_countr<   ZassertsrH  rW   r   rJ  )rI   argtypesr  rF  rA  rd  r   r   current_devicerJ  ZschdulerrL  rM  Z
capabilityrQ  rR  rA   rN  rB   halide_kernel_meta  s~   















zHalideKernel.halide_kernel_metac                   s`   j jrtd  }t }|jddd |    D ]F\}}t|t	r5|
|j d j d q|js<J |d|jv rCdnd	}t|j}t j|j }|
|j d
| d| d| d q|d |    D ]\}}|
|j d|j  qr j  D ]\}	}
|
|	 d
|
  q| j  fdd} jjD ]}t|trtj||}|
| q|
d |
d   D ]\}}t|t	rtjjj|jdd}|
|j d| d qɈ j|j }g }t|D ]k\}}  tjjj|j!dd|}|"d| d d|jvr`|
|j d| d z|
|j d| dt#|j$ d W n
 t%y@   Y nw z|
|j d| dt#|j! d W q t%y_   Y qw q|
|j dd&| d q|'d |d(  |j)r|jdt*+|j)d |j,d!|j)d|j-d"	dd |. S |jd#|j,d$dd |. S )%z3Called at the end to generate a final kernel stringinplace_buffersz
            import halide as hl
            from torch._inductor.runtime import halide_helpers
            from math import inf, nan

            @hl.generator(name="kernel")
            class Kernel:
        Tstripz = hl.InputScalar(r6   outzhl.OutputBufferzhl.InputBufferr  r   rU   z&
            def generate(g):
        z = g.c                   s2   t t jj| d }|jd usJ |t|S )Nr$   )r   r;  r  r	  r  r=  r)  )r  r%  rN  rA   rB   update_index  s   z1HalideKernel.codegen_kernel.<locals>.update_indexr  zassert g.using_autoscheduler()r$   r  z.set_estimate(r  z.dim(z).set_min(0)z).set_stride(z).set_extent(z.set_estimates([r  r   zN
            if __name__ == "__main__":
                hl.main()
            z:
                else:
                    hl.load_plugin(z))
                    target = hl.Target(z=)
                    autoscheduler = hl.AutoschedulerParams(a  )
                    with hl.GeneratorContext(target, autoscheduler):
                        gen = Kernel()
                        pipeline = gen._build_pipeline()
                        # gen.compile_to_callable() does not run the autoscheduler
                        pipeline.apply_autoscheduler(target, autoscheduler)
                        kernel = pipeline.compile_to_callable([
                                gen._get_input_parameter(a.name)._to_argument()
                                for a in gen._get_arginfos()
                                if a.dir == hl.ArgInfoDirection.Input
                            ], target)
                zR
                  else:
                      with hl.GeneratorContext(hl.Target(zX)):
                          kernel = Kernel().compile_to_callable()
                  )/re   rV  rD   rU  r(   spliceZ	do_indentr=  r7   r,   r  r   rW   r:  r   r   rd   r}  aliasesr  r3  _linesr)  r;  ra  r  r#   rf  rg  rt  rY   r  _autoscheduler_workaroundsr*  r  r8   rd  ru  rJ  Zdo_unindentrstriprK  r   Zfind_libautoschedulerJ  rL  getvalue)rI   r   metacoder  rF  ZargclsZargtypendimr  r  rZ  r   hintrL  Zrange_hintsr  dimrA   rN  rB   codegen_kernel  s   

&



 

zHalideKernel.codegen_kernelc                 C  s6   t |dkrtjjdkrtj jdkrtd| } | S )Nr$   ZAnderson2021rC  r   )	rd   r   r:  rP  r#   rf  rN  rO  r<   )r   rL  rA   rA   rB   r^  N  s
   
z'HalideKernel._autoscheduler_workaroundsc                 C  s^   t jj}dd |  D }t j }|jdkr$||jt j}|| |j	|||dd dS )zCodegen a call to this kernelc                 S  s    g | ]\}}|j d u r| qS r   r8  )rT  r   rF  rA   rA   rB   r[  \  s     z,HalideKernel.call_kernel.<locals>.<listcomp>rC  F)deviceZtritonN)
r#   rf  wrapper_coder=  rN  rO  Zwrite_get_raw_streamr$  r  Zgenerate_kernel_call)rI   r   r  wrapperZ	call_argsrT  stream_namerA   rA   rB   call_kernelY  s   



zHalideKernel.call_kernelc                 C  s   dS rp  rA   )rI   r+  rA   rA   rB   generate_asserth  s   zHalideKernel.generate_assertrY   r*  lowerupperc                 C  s   d S r   rA   )rI   rY   r*  rm  rn  rA   rA   rB   check_boundsk  s   zHalideKernel.check_bounds)ry  rz  rE   rF   )r   r   rE   r)  )NN)r  r  )r$  rc  )r%  r)  r$  rc  r  r   rp  )r   r)  r$  rc  )r   r)  r   )
r   r)  r$  rc  r   r&   r  r5   rE   rF   )
r   r   r   r   r  r4   r   r  rE   r  )r$  r%  r  r&  r'  r(  rE   r(  )rE   r;  )rE   r   )rY   rc  r*  rc  rm  r   rn  r   )*rO   rP   rQ   r   Z	overridestexprr-  rq  rH   r  r  r  r  r  r   r  r  r  r  r#  rC  r  r   rm  r  r  r  r3  r   unknownr!  r  r6  r=  rU  rf  r   r^  rk  rl  ro  rR   rA   rA   rK   rB   rx    sJ   
 

 ,	
h


	
'
=
&T


$
Sy

rx  c                   @  s&   e Zd ZeZed
ddZdd Zd	S )HalideSchedulingrg  torch.devicerE   OrderedSet[BackendFeature]c                 C  s,   t tjtjtjg}tjjr|tj	 |S r   )
r   r%   ZTUPLE_REDUCTIONZPREFER_STORE_LOOP_ORDERZREDUCE_TO_SINGLE_ELEMENTr   r:  Zscan_kernelsr  ZSCAN)r   rg  r5  rA   rA   rB   get_backend_featurest  s   z%HalideScheduling.get_backend_featuresc           
      C  s   t jj}||jv r|j| }|S d|  }||j|< |d t }|d| d |j	|dd |d t
||\}}| d| }	||| |	 td	r\t|d
| |S )z6Codegen kernel definition to go in output wrapper codeZhalide_kernel_zEfrom torch._inductor.runtime.hints import HalideMeta, HalideInputSpeczasync_compile.halide(z, '''TrW  z''')
Zkernel_metadatar  )r#   rf  rh  Zsrc_to_kernelZnext_kernel_suffixZadd_import_oncer(   r  rU  r[  r   define_kernelr`  r   r   )
rI   Zsrc_codeZnode_schedulerV   ri  Zkernel_nameZcompile_wrapperZoriginsZdetailed_originsZmetadata_commentrA   rA   rB   rw    s.   




zHalideScheduling.define_kernelN)rg  rs  rE   rt  )rO   rP   rQ   rx  Zkernel_typer9  ru  rw  rA   rA   rA   rB   rr  q  s
    rr  )r
__future__r   dataclassesr  r?  loggingr_  collectionsr   mathr   typingr   r   r   r   r	   r
   r|   r9   Ztorch._loggingZ_prims_commonr   Zutils._ordered_setr   Zutils._sympy.functionsr   r   Zutils._sympy.symbolr   r   Zutils._sympy.value_rangesr   r  r   r   Z	codecacher   r   Zmetricsr   r   Zops_handlerr   Zruntime.hintsr   r   utilsr   r   r   r    r!   Zvirtualizedr"   r   r#   commonr%   r&   r'   r(   r)   r*   r+   r,   r-   cppr.   Z	cpp_utilsr/   Zsimdr0   r1   r2   collections.abcr3   r4   r5   	getLoggerrO   r  rC   RuntimeErrorrD   rS   ru   rp  Zpexprr   r   r   r   Zfloat64r  Zint16r   r:   Zuint8Zuint16Zuint32Zuint64r   r   r   r   Z_initialize_pointwise_overridesr;  	dataclassrb  rv  r  rx  rr  rA   rA   rA   rB   <module>   s    ,
}  
H'       ^