o
    Zh$!                     @   s0  d dl Z d dlZd dlmZ d dlmZ ddlmZmZ	 ddl
mZmZmZmZ ddlmZmZmZmZmZ ddlmZ d	d
lmZmZmZmZmZmZ e eZ ej!j"Z"edd Z#dd Z$dd Z%ede#ddZ&eej'dZ(eej)de"j)j*dZ+e	,e"j'ddddZ-e	,e"j)d	d	ddddZ.dS )    N)counters)CKGemmTemplate   )irlowering)autotune_select_algorithmExternKernelChoiceSymbolicGridFnTritonTemplate)use_aten_gemm_kernelsuse_ck_gemm_templateuse_cpp_bmm_templateuse_cutlass_templateuse_triton_template)V   )_is_static_problemaddmm_epiloguemm_args
mm_configs
mm_optionsshould_fallback_to_atenc                C   s"   |||d |||d  | dfS )NZBLOCK_MZBLOCK_Nr    )bmnmetaZcdivr   r   I/var/www/auris/lib/python3.10/site-packages/torch/_inductor/kernel/bmm.pybmm_grid%   s   "r   c                 C   s(   | dks|dks|dkrdS | | dkS )N   Ti   r   )r   r   kr   r   r   _is_large_block_for_cpu*   s   r!   c                C   s&   |dkrt | ||dtdS t | ||S )Ncpug      ?)scaleexclude)r   r!   )r   r   r    device_typer   r   r   bmm_configs1   s   r&   bmma  
{{def_kernel("A", "B")}}
    M = {{size("A", -2)}}
    N = {{size("B", -1)}}
    K = {{size("A", -1)}}

    stride_aq = {{stride("A", 0)}}
    stride_am = {{stride("A", 1)}}
    stride_ak = {{stride("A", 2)}}

    stride_bq = {{stride("B", 0)}}
    stride_bk = {{stride("B", 1)}}
    stride_bn = {{stride("B", 2)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        ram = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        ram = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        rbn = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        rbn = rn % N

    rk = tl.arange(0, BLOCK_K)

    idx_q = tl.program_id(1)  # batch dimension for BMM
    A = A + (ram[:, None] * stride_am + rk[None, :] * stride_ak + idx_q*stride_aq)
    B = B + (rk[:, None] * stride_bk + rbn[None, :] * stride_bn + idx_q*stride_bq)

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    for k in range(K, 0, -BLOCK_K):
        if EVEN_K:
            a = tl.load(A)
            b = tl.load(B)
        else:
            a = tl.load(A, mask=rk[None, :] < k, other=0.)
            b = tl.load(B, mask=rk[:, None] < k, other=0.)
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)
        A += BLOCK_K * stride_ak
        B += BLOCK_K * stride_bk

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_q = tl.program_id(1)  # batch dimension for BMM
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_q", "idx_m", "idx_n"), "acc", "mask")}}
)namegridsourcezat::bmm_outzat::baddbmm_out)Zop_overloadlayoutc             
      sD  t dd | |fD r`|  d dks| d dkr2t| d} t|d}tjt| |ddS dd }d	d
   fdd}|| rPtjjj	d }|| |} ||r`tjjj	d }|||}t
| ||d\}}}	}} }td d| d| d|	   d7  < td|||	|  | | t rt| |f|gng }
t|rt|||	t| dD ]}tj|
f| |f|dt||||	| qt|\}}|r|rt||||	rddlm} ||
|| |g t|| |rddlm } |!|
|| |g t"||||	r	t#$|
|| |g t%|
r|
&t| |f| t'd|
| |g|S )Nc                 s   s    | ]
}|  jd kV  qdS )r"   N)Z
get_devicetype).0xr   r   r   	<genexpr>   s    ztuned_bmm.<locals>.<genexpr>r   r   )Zaxisc                 S   s,   t | sdS t j| dd\}}t|t jS )NTF)freeze)r   Zis_storage_and_layoutZas_storage_and_layout
isinstanceZFlexibleLayout)t_r,   r   r   r   is_valid_to_require_contiguous   s   
z1tuned_bmm.<locals>.is_valid_to_require_contiguousc                 S   sP   |d dko| d dkp|d | d kp'|d dko'| d dkp'|d | d kS )Nr1   r   r   )sizesstridesr   r   r    is_preferred_layout_as_bmm_input   s   &&z3tuned_bmm.<locals>.is_preferred_layout_as_bmm_inputc                    s6   |j d  }|j d  } ||stj| } | S )Nval)r   sizeZstrider   ZExternKernelZrequire_contiguous)r4   Zmeta_tr8   r9   r:   r   r   may_require_contiguous   s
   
z)tuned_bmm.<locals>.may_require_contiguousr   r+   aten_mm_infoz	aten.bmm_r5   zPTuned aten.bmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, output_layout=%sr%   Zinput_nodesr,   )CUTLASS3xGemmTemplate)CppBmmTemplater'   )(allget_sizeLZ	unsqueezeZsum_mulr   graphZcurrent_nodeargsr   r   loginfo	get_dtyper   aten_bmmbindr   r&   r   get_device_typebmm_templatemaybe_append_choicer   r   r   Zcodegen.cuda.gemm_templaterB   Zadd_cutlass_gemm_choicesr   Zcodegen.cpp_bmm_templaterC   Zadd_choicesr   r   Zadd_ck_gemm_choicesr   appendr   )mat1mat2r,   r6   r>   Z	meta_mat1Z	meta_mat2r   r   r    choicesconfigZstatic_shapeZ
is_nonzerorB   rC   r   r=   r   	tuned_bmm   sh    


&
rW   )alphabetar,   c             
   C   s  t ||| |d\}}}}}}} td d| d| d|   d7  < td|||| | |  | t rDtj| ||f|||dgng }	t|rwt	|||t
|dD ]!}
tj|	f| ||f|d	t|
||||dt|j||d
 qUtd|	| ||g|S )Nr+   r?   zaten.baddbmm_r5   r   z\Tuned aten.baddbmm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s, inp=%s, output_layout=%s)rX   rY   r@   rA   )Zprefix_argsZepilogue_fnbaddbmm)r   r   rJ   rK   rL   r   aten_baddbmmrN   r   r&   r   rO   rP   rQ   r   r   Zdtyper   )inprS   rT   rX   rY   r,   r   r   r    rU   rV   r   r   r   tuned_baddbmm   s<   &	r]   )/loggingZtorchZtorch._dynamo.utilsr   Z7torch._inductor.codegen.rocm.ck_universal_gemm_templater    r   r   rF   Zselect_algorithmr   r   r	   r
   utilsr   r   r   r   r   Zvirtualizedr   Z	mm_commonr   r   r   r   r   r   	getLogger__name__rJ   opsZatenr   r!   r&   rP   r'   rM   rZ   outr[   Zregister_loweringrW   r]   r   r   r   r   <module>   s8    


G

V