o
    ZhX                  	   @  sP  d dl mZ d dlZd dlmZmZmZmZ d dlZd dl	m
Z
 ddlmZmZ ddlmZmZmZmZ ddlmZmZmZmZ dd	lmZmZmZmZmZmZ dd
l m!Z! ddl"m#Z#m$Z$ erld dl%m&Z& ddlm'Z' e(e)Z*ej+j,Z,edd Z-edd Z.dddddddddddddddddddddgZ/e0dd e/D Z1ej2j3rej45 re#e1Z1dd Z6d d! Z7d"Z8	 ed#e-d$e8 d% e8 d& d'Z9d(Z:ed)e.d*e: d+ e: d, d'Z;eej<d-d.e,j<j=d/Z>d0d1 Z?ee?dZ@G d2d3 d3eZAdQdEdFZBdGdH ZCdIdJ ZDee,j<dRdKdLZ<ee,jEdMdN ZEdOdP ZFee,j<eF dS )S    )annotationsN)castOptionalTYPE_CHECKING	TypedDict)CKGroupedConvFwdTemplate   )configir)add_layout_constraintconstrain_to_fx_strides	loweringsregister_lowering)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)is_onesis_zerospad_listlikesympy_productuse_ck_conv_templateuse_triton_template)V   )build_rocm_gemm_configsfiltered_configs)Sequence)	TensorBoxc                C  s*   || | | |d |||d |d fS NZBLOCK_MZBLOCK_NGROUPS )nchwmetacdivr!   r!   J/var/www/auris/lib/python3.10/site-packages/torch/_inductor/kernel/conv.pyconv2d_grid.   s   r)   c                C  s.   || | | | |d |||d |d fS r   r!   )r"   r#   dr$   r%   r&   r'   r!   r!   r(   conv3d_grid7   s   r+   )@         r      T)r	   cond)r-   r,   r.   r   r/   )i   r.   r.   r      )   r2       r   r1   )r,   r,   r3   r   r/   )r,   r-   r3   r   r1   )r-   r,   r3   r   r1   c                 c  s4    | ]}|d  rt ttttttf |d V  qdS )r0   r	   N)r   tupleint).0r	   r!   r!   r(   	<genexpr>N   s    
r7   c                 C  s,   | dks|dks|dkrdS | | | dkS )Nr-   Ti   r!   )mr"   kr!   r!   r(   _is_large_block_for_cpuY   s   r:   c                K  s,   |dkrt | ||tdtdS t | ||tdS )Ncpug      ?)configsscaleexclude)r<   )r   platform_configsr:   )r8   r"   r9   device_typekwargsr!   r!   r(   conv_configs`   s   rB   a  
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
Zconvolution2dag  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_H = {{size("X", 2)}}
    IN_W = {{size("X", 3)}}
    OUT_C = {{size(None, 1)}}
    OUT_H = {{size(None, 2)}}
    OUT_W = {{size(None, 3)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xh = {{stride("X", 2)}}
    stride_xw = {{stride("X", 3)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wh = {{stride("W", 2)}}
    stride_ww = {{stride("W", 3)}}

    nhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = nhw % OUT_W
    nh = nhw // OUT_W
    idx_y_h = nh % OUT_H
    idx_n = nh // OUT_H
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        a  
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for ijk in range(KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (ijk % BLOCK_K_COUNT) * BLOCK_K
        ij = ijk // BLOCK_K_COUNT
        i = ij // KERNEL_W
        j = ij % KERNEL_W
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_h", "idx_w"), "acc", "mask")}}
)namegridsourcea  
        idx_x_d = d - PADDING_D + idx_y_d * STRIDE_D
        idx_x_h = i - PADDING_H + idx_y_h * STRIDE_H
        idx_x_w = j - PADDING_W + idx_y_w * STRIDE_W
        idx_x_c = tl.arange(0, BLOCK_K) + k

        x_ptrs = x_base + (
            (idx_x_d * stride_xd)[:, None]
            + (idx_x_h * stride_xh)[:, None]
            + (idx_x_w * stride_xw)[:, None]
            + (idx_x_c * stride_xc)[None, :]
        )
        mask_x = (
            (idx_n < BATCH)[:, None]
            & (idx_x_d >= 0)[:, None]
            & (idx_x_d < IN_D)[:, None]
            & (idx_x_h >= 0)[:, None]
            & (idx_x_h < IN_H)[:, None]
            & (idx_x_w >= 0)[:, None]
            & (idx_x_w < IN_W)[:, None]
            & (idx_x_c < GROUP_IN_C)[None, :]
        )
        matrix_x = tl.load(x_ptrs, mask=mask_x, other=0.0)

        w_ptrs = w_base + (
            (idx_x_c * stride_wc_in)[:, None] +
            (d * stride_wd) + (i * stride_wh) + (j * stride_ww)
        )
        mask_w = (idx_x_c[:, None] < GROUP_IN_C) & (idx_y_c[None, :] < GROUP_OUT_C)
        matrix_w = tl.load(w_ptrs, mask=mask_w, other=0.0)
        acc += tl.dot(matrix_x, matrix_w, allow_tf32=ALLOW_TF32)
Zconvolution3daH  
{{def_kernel("X", "W")}}
    # Tensor dimensions
    BATCH = {{size("X", 0)}}
    IN_C = {{size("X", 1)}}
    IN_D = {{size("X", 2)}}
    IN_H = {{size("X", 3)}}
    IN_W = {{size("X", 4)}}
    OUT_C = {{size(None, 1)}}
    OUT_D = {{size(None, 2)}}
    OUT_H = {{size(None, 3)}}
    OUT_W = {{size(None, 4)}}

    # Strides:
    stride_xn = {{stride("X", 0)}}
    stride_xc = {{stride("X", 1)}}
    stride_xd = {{stride("X", 2)}}
    stride_xh = {{stride("X", 3)}}
    stride_xw = {{stride("X", 4)}}
    stride_wc_out = {{stride("W", 0)}}
    stride_wc_in = {{stride("W", 1)}}
    stride_wd = {{stride("W", 2)}}
    stride_wh = {{stride("W", 3)}}
    stride_ww = {{stride("W", 4)}}

    ndhw = tl.program_id(0) * BLOCK_M + tl.arange(0, BLOCK_M)
    idx_y_w = ndhw % OUT_W
    ndh = ndhw // OUT_W
    idx_y_h = ndh % OUT_H
    nd = ndh // OUT_H
    idx_y_d = nd % OUT_D
    idx_n = nd // OUT_D
    idx_y_c = tl.program_id(1) * BLOCK_N + tl.arange(0, BLOCK_N)

{% if GROUPS == 1 %}
    group = 0
    GROUP_IN_C = IN_C
    GROUP_OUT_C = OUT_C
{% else %}
    group = tl.program_id(2)
    GROUP_IN_C = IN_C // GROUPS
    GROUP_OUT_C = OUT_C // GROUPS
{% endif %}

    x_base = X + (group * stride_xc * GROUP_IN_C + idx_n * stride_xn)[:, None]
    w_base = (
        W + (group * stride_wc_out * GROUP_OUT_C + idx_y_c * stride_wc_out)[None, :]
    )

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

{% if UNROLL %}
{% for d in range(KERNEL_D) %}
{% for i in range(KERNEL_H) %}
{% for j in range(KERNEL_W) %}
    d = {{d}}
    i = {{i}}
    j = {{j}}
    for k in range(0, GROUP_IN_C, BLOCK_K):
        aF  
{% endfor %}
{% endfor %}
{% endfor %}
{% else %}
    # Could be simplified, but slightly slower:
    # for d in range(KERNEL_D):
    #   for i in range(KERNEL_H):
    #     for j in range(KERNEL_W):
    #         for k in range(0, GROUP_IN_C, BLOCK_K):
    BLOCK_K_COUNT = (GROUP_IN_C + BLOCK_K - 1) // BLOCK_K
    for dijk in range(KERNEL_D * KERNEL_H * KERNEL_W * BLOCK_K_COUNT):
        k = (dijk % BLOCK_K_COUNT) * BLOCK_K
        dij = dijk // BLOCK_K_COUNT
        j = dij % KERNEL_W
        di = dij // KERNEL_W
        i = di % KERNEL_H
        d = di // KERNEL_H
        a  
{% endif %}

    mask = (
        (idx_n < BATCH)[:, None]
        & (idx_y_d < OUT_D)[:, None]
        & (idx_y_h < OUT_H)[:, None]
        & (idx_y_w < OUT_W)[:, None]
        & (idx_y_c < GROUP_OUT_C)[None, :]
    )
    idx_n = idx_n[:, None]
    idx_c = idx_y_c[None, :] + group * GROUP_OUT_C
    idx_d = idx_y_d[:, None]
    idx_h = idx_y_h[:, None]
    idx_w = idx_y_w[:, None]

    # inductor generates a suffix
    {{store_output(("idx_n", "idx_c", "idx_d", "idx_h", "idx_w"), "acc", "mask")}}
zat::convolutionF)Zhas_out_variantZop_overloadc             	   C  sD   t t |dd}t j| dddd|dd|dddddS )Nr   r      r   )out)torchsqueezematmulpermute)xr%   rH   r!   r!   r(   conv1x1_via_mm{  s   &rN   c                   @  s>   e Zd ZU ded< ded< ded< ded< ded< ded	< d
S )ConvLayoutParamstuple[int, ...]stridepaddingdilationbool
transposedoutput_paddingr5   groupsN)__name__
__module____qualname____annotations__r!   r!   r!   r(   rO     s   
 rO   rM   r   weightbiasOptional[TensorBox]rQ   Sequence[int]rR   rP   rS   rU   rT   rV   rW   r5   return	ir.Layoutc	                 C  s   t jjH tjjtj| ddtj|ddtj|ddt jj	
|t jj	
|t jj	
||t jj	
||	}	t|	 }
t|	 }W d   n1 sOw   Y  t|  |  |
|S )z)Determine output layout for a convolutionT)Zguard_shapeN)r   graphZ	fake_moderI   opsatenconvolutionr
   Zir_node_to_tensorsizevars
size_hintsZconvert_shape_to_inductorsizerQ   ZFixedLayoutZget_device_or_errorZ	get_dtype)rM   r\   r]   rQ   rR   rS   rU   rV   rW   outputsizesr!   r!   r(   conv_layout  s*   
rk   c                 C  s&   t tt| }|d|d |S )Nr   rF   )listreversedrangeinsertpop)rankorderr!   r!   r(   channels_last_order  s   rs   c           
      C  s  t | }t|d D ]}ttj |dd}qttj |ddg}tj	| t
|} tt|}||d ttj | |} |  ^ }}ttj | t||g} |d u rattj | |}n	ttj || |}ttj |g |d}tt|}	|	d|	d ttj ||	S )Nr   rF   dimr   r   )lenget_sizern   Lrd   rJ   rL   r
   ExternKernelrequire_stride_orderrs   rl   appendrp   Zreshaper   mmZaddmmro   )
rM   r\   r]   rq   _Z	x_permuterj   in_chanresultZresult_permuter!   r!   r(   convert_1x1_conv_to_mm  s"   r   c	                   s   t |}t |}t |}t |}t|tstjj|}t|ts#J t tjj|}t tjj|}||||||d t	 t	 d krht
tj tt
tj dg	 |fi  ddS tjj	 ^}	}
}t	 dkrt|dkrtdkr d| d| d| d| d	 t
tj d
dt
tj d
dt
tj t|fi  d
dS t|t|}t|}t|}t|} fdd}tjptj}tjs|r#| r#t|r#t|r#t|r#t|r#|s#t|r#|dkr#tjjt	 dr#t|S |d urRtdkrRtd fi  }t
tj |t
tj ||	 d gdg  S     tjj rd
krtj j!d7  _!tj"#tj"#t$d fi  }n#t$d fi  }t%tjj&|j'}tj"(|tj"(|g d}|d u rg}d  d< |)dd n|g}|  |*  tjj|	  g }t+j,j-.drt/j0|||fi  g}t+j,j-.drt1|rt|r|st|rtjj2|
	 d rt|r.t|r.t|r.|dkr.|3t40|| t5t	 d g	 d
d  |	|
tdD ]}d
kr~t6j7|ff||d |d |d |d |d |d |t|t+j8j9j:|j;|j<d|j= qHdkrt>j7|fi dfd|d|d d|d d|d
 d|d d|d d|d
 d|d d|d d|d
 d|dt|d t+j8j9j:d!|j;d"|j<|j= qHt?|rt@jA||f|d ur|fnt   ||||d# tBd$|||S )%N)rQ   rR   rS   rU   rV   rW   r   r   rt   rG   Zxpu)r   )r   )rQ   rR   rS   rV   r   c                    sH   t jjr
dkr
dS td fi  } tt jj| j}|tj	kS )Nr   T)
r   rb   
layout_optrk   r
   get_stride_orderrf   rg   rQ   ZNHWC_STRIDE_ORDER)layoutreq_stride_orderrA   ndimr\   rM   r!   r(   channels_last_conv  s   
z'convolution.<locals>.channels_last_convr;   r]   ZATENZTRITON)r@   )input_nodesr   KERNEL_HKERNEL_WSTRIDE_HSTRIDE_W	PADDING_H	PADDING_Wr    UNROLL
ALLOW_TF32
num_stages	num_warpsr   r   ZKERNEL_Dr   r   ZSTRIDE_Dr   r   Z	PADDING_Dr   r   r    r   r   r   r   )r   rQ   rR   rS   rW   Zn_spatial_dimensionsre   )Cr4   
isinstancer5   r   rb   rf   Zevaluate_static_shapeZevaluate_static_shapesrv   rw   rx   rd   rJ   re   expandr
   Zget_device_typeupdateZ	unsqueezer   r	   Zmax_autotuneZmax_autotune_gemmZconv_1x1_as_mmr   r   Zstatically_known_gtr   r   addviewZrealizer   Znum_channels_last_convry   Zrequire_channels_lastrk   r   rg   rQ   rz   ro   Zfreeze_layoutrI   Z	_inductorutilsZ_use_conv_autotune_backendaten_convolutionbindr   Zstatically_known_equalsr{   aten_conv1x1_via_mmrB   conv2d_templateZmaybe_append_choicebackendsZcudnn
allow_tf32r   r   rA   conv3d_templater   r   Zadd_ck_conv_choicesr   )rM   r\   r]   rQ   rR   rS   rU   rV   rW   Zout_chanr~   Zkernel_shaper   Zautotuning_gemmr   r   r   Zordered_kwargs_for_cpp_kernelargschoicescfgr!   r   r(   re     s  
	(
	





$



 



	



re   c              
   C  s   t | ||||||||	S N)re   )rM   r\   r]   rQ   rR   rS   rU   rV   rW   Z	benchmarkZdeterministicZcudnn_enabledr   r!   r!   r(   _convolution  s   r   c                 O  s<   | j tjjjjksJ tjjr||fS t	| g|R i |S r   )
targetrI   rc   rd   re   defaultr   rb   r   r   )Zfx_noder   rA   r!   r!   r(   constrain_conv_to_fx_strides  s   r   )rM   r   r\   r   r]   r^   rQ   r_   rR   rP   rS   rP   rU   rT   rV   rP   rW   r5   r`   ra   )rM   r   r\   r   r]   r^   rQ   r_   rR   r_   rS   r_   rU   rT   rV   r_   rW   r5   )G
__future__r   loggingtypingr   r   r   r   rI   Z-torch._inductor.codegen.rocm.ck_conv_templater    r	   r
   Zloweringr   r   r   rx   r   Zselect_algorithmr   r   r   r   r   r   r   r   r   r   r   Zvirtualizedr   Z	mm_commonr   r   collections.abcr   r   	getLoggerrX   logrc   rd   r)   r+   Zkernel_configsr4   r?   versionZhipcudaZis_availabler:   rB   ZLOOP_BODY_2Dr   ZLOOP_BODY_3Dr   re   r   r   rN   r   rO   rk   rs   r   r   r   r!   r!   r!   r(   <module>   s    


45DE[!<=PQi

	# s
