o
    wZh>C                  	   @   s  d dl Z d dlmZmZ d dlmZ e edZe edZe rd dl	Z	d dl	m
Z e	j		dTd	d
Ze	j		dTddZe	j				dUddZe	j				dUddZe	je	jddiddde	jddiddde	jddiddde	jddidddgg de	j		dTddZe	je	jddidddgg de	j		dTddZe	je	jdddddde	jdddddde	jdddddde	jddddddgg de	j	 		!	dVd"d#Zd$d% Ze	je	jddiddde	jddidddgg d&d'd(eid)e	j		dTd*d+Ze	j		dTd,d-Ze	j		dTd.d/Ze	j	 		!	dVd0d1Ze	j		dTd2d3Ze	j		dTd4d5Ze	jd6d7 Ze	j			8	dWd9d:Ze	j	;		<	dXd=d>Ze	j	?		@	dYdAdBZ e	j	?		@	dYdCdDZ!e	jdej"fdEdFZ#e	jdej"fdGdHZ$d dIl%m&Z&m'Z' e	j		dTdJdKZ(e	j		dTdLdMZ)e	j		dTdNdOZ*e	j		dTdPdQZ+e	j		dTdRdSZ,dS dS )Z    N)HAS_CUDAHAS_GPU)
has_tritonzrequires cudazrequires gpu)language
BLOCK_SIZEtl.constexprc                 C   p   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S Nr   Zaxismasktl
program_idarangeloadstorein_ptr0in_ptr1out_ptr
n_elementsr   pidblock_startoffsetsr   xyoutput r   S/var/www/auris/lib/python3.10/site-packages/torch/testing/_internal/triton_utils.py
add_kernel      r    c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   r   r   r   r   
sub_kernel"   r!   r"   ARGS_PASSEDc                 C   s~   t jdd}|| }|t d| }||k }	t j| | |	d}
|dkr1t j|| |	d}|
| }n|
}t j|| ||	d d S Nr   r
   r   twor   )r   r   r   r   r#   r   r   r   r   r   r   r   r   r   r   r   add_kernel_with_optional_param3   s   	
r&   c                 C   s   t jdd}|| }|t d| }	|	|k }
t j| |	|  |
d}|dkr3t j||	 |
d}|| }n|}t j||	|  ||
d d S r$   r   )r   r   r   r   Zstrider#   r   r   r   r   r   r   r   r   r   r   r   -add_kernel_with_none_param_and_equal_to_1_argH   s   

r'            )Z
num_stagesZ	num_warps   @   )configskeyc                 C   r   r	   r   r   r   r   r   add_kernel_autotuned^   s   r/         c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   &add_kernel_autotuned_weird_param_orderx   s   r2   )BLOCK_SIZE_XBLOCK_SIZE_Yr3   r4   c                 C   s   t d| }|t d|d d d f  }||k }	t d| }
|
t d|d d d f  }||k }|}|}t | |||   |	|@ }t | |||   |	|@ }|| }t ||||   ||	|@  d S )Nr      r   )r   r   r   Z
x_elementsZ
y_elementsr3   r4   ZxoffsetZxindexZxmaskZyoffsetZyindexZymaskx1Zy0Ztmp0Ztmp1Ztmp2r   r   r   add_kernel_2d_autotuned   s   "r7   c                 O   s   | S )Nr   )r-   ___r   r   r   _dummy_early_config_prune   s   r:   
      Zearly_config_prune)r-   r.   ZwarmuprepZprune_configs_byc                 C   r   r	   r   r   r   r   r   *add_kernel_autotuned_with_unsupported_args   s   r>   c                 C   st   t jdd}|| }|t d| }||k }	t j| | |	d}
t j|| |	d}|
| | }t j|| ||	d d S r	   r   )r   r   r   r   Zscaling_factorr   r   r   r   r   r   r   r   r   r   r   add_kernel_with_scaling   s   	r?   c           	      C   s\   t jdd}|| }t | |g|gt j}t ||g|gt j}|| }t |||g d S )Nr   r
   r   r   Z_experimental_descriptor_loadZfloat32Z_experimental_descriptor_store)	in_desc_ptr0in_desc_ptr1out_desc_ptrr   r   offsetabr   r   r   r   add_kernel_with_tma_1d   s(   rG   c                 C   sz   t jdd}t jdd}|| }|| }t | ||g||gt j}	t |||g||gt j}
|	|
 }t ||||g d S )Nr   r
   r5   r@   )rA   rB   rC   r3   r4   Zpid_xZpid_yZoffset_xZoffset_yr   r   r   r   r   r   add_kernel_with_tma_2d  s,   rH   c           
      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }	t j|| |	|d d S Nr   r
   r   r1   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r   mul2_kernel*  s   rJ   c           	      C   s^   t jdd}|| }|t d| }||k }t j| | |d}d| }t j| | ||d d S rI   r   )	Zptrr   r   r   r   r   r   r   r   r   r   r   mul2_inplace_kernel9  s   rK   c                 C   s   t | dk| dS )Nr   )r   where)r   r   r   r   	zero_negsG  s   rM   
ACTIVATIONc           
      C   s   t jdd}|| }|t d| }||k }|dkr"t| ||d n|dkr/t| | |||d t j| | |d}	t j|| |	|d d S )Nr   r
   rK   )r   r    r   )r   r   r   rK   r    r   r   )
r   r   r   r   rN   r   r   r   r   r   r   r   r   indirection_kernelK  s   rO   X_BLOCK_SIZEY_BLOCK_SIZEc                 C   s   t jdd}t jdd}|| }|| }	|t d| }
|	t d| }|d d d f | |
d d d f  }|d d d f | |
d d d f  }t | | }t || |d  d S )Nr   r
   r5   g       @r   )Zin_ptrr   Zin_y_strideZout_y_striderP   rQ   xidZyidZx_startZy_startZ	x_offsetsZ	y_offsetsZsrc_offsetsZdst_offsetssrcr   r   r   double_strided_kernel^  s   	$$rT   nBLOCKc           	      C   x   t | t d| }t |t d| }t |g|t j}t jdd|||gt jddd}t |t d| | d S )Nr   shf.l.wrap.b32 $0, $1, $2, $3;
=r,r, r, rTr5   ZdtypeZis_purepackr   r   r   fullZint32Zinline_asm_elementwiser   	XYZrU   rV   r   r   szr   r   r   inline_asm_kernel_is_pure_truer     rd   c           	      C   rW   )Nr   rX   rY   Fr5   rZ   r\   r^   r   r   r   inline_asm_kernel_is_pure_false  re   rf   c           
   	   C   s   t jdd}|| }t jt j| |gdg|g|gdgddgd}t jt j||gdg|g|gdgddgd}|| }	t jt j||gdg|g|gdgd|	dgd d S Nr   r
   r5   )baseshapestridesr   Zblock_shapeorder)Zboundary_checkr   r   r   Zmake_block_ptrr   )
x_ptrZy_ptr
output_ptrr   r   r   r   r   r   r   r   r   r   add_kernel_with_block_ptr  sJ   
ro   c              	   C   s   t jdd}|| }t jt j| |dgddg|dg|dgddgddgd}|}t jt j||dgddg|dg|dgddgd|dgd d S rg   rl   )rm   rn   r   r   r   r   r   r   r   r   r   kernel_with_block_ptr_2d  s4   
rp   )r   r   c                 C   sj   t jdd}|| }|t d| }||k }t| | |d}	t|| |d}
|	|
 }t|| ||d d S r	   r   r   r   r   r   add_kernel_with_import  s   rq   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
t ddkr4|	|
 }n|	|
 }t j|| ||d d S r	   r   r   r   r   r   cond_op_kernel  s   
rr   c                 C   r   r	   )r   r   r   r   Z
atomic_addr   r   r   r   atomic_add_kernel  r!   rs   c                 C   s   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
tdD ]}|	|
 }t j|| ||d q,d}|dkr[|d8 }|	|
 }t j|| ||d |dksCd S d S )Nr   r
   r   r1   r5   )r   r   r   r   ranger   )r   r   r   r   r   r   r   r   r   r   r   ir   r   r   r   add_4_times_kernel  s   rv   c                 C   sp   t jdd}|| }|t d| }||k }t j| | |d}	t j|| |d}
|	|
 }t j|| ||d d S r	   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   add_kernel_out_of_order_fn23  r!   rw   )r   r   )r#   r   r   r   )r3   r   r4   r   )r   r   rN   r   )rP   r   rQ   r   )rU   r   rV   r   )-ZunittestZ&torch.testing._internal.inductor_utilsr   r   Ztorch.utils._tritonr   Z
skipUnlessZrequires_cudaZrequires_gpuZtritonr   r   Zjitr    r"   r&   r'   ZautotuneZConfigr/   r2   r7   r:   r>   r?   rG   rH   rJ   rK   rM   rO   rT   rd   rf   Z	constexprro   rp   Ztriton.languager   r   rq   rr   rs   rv   rw   r   r   r   r   <module>   sN  	
!
-!    