# Recovered source of torch/_inductor/kernel/mm.py, reconstructed from a CPython 3.10
# .pyc dump.  The Triton template strings below survived verbatim; the surrounding
# module code is rebuilt on a best-effort basis from the decompiled constant and name
# tables, and anything that could not be recovered exactly is marked as such in comments.

import functools
import logging
import re
from typing import Optional

import torch
from torch._dynamo.utils import counters
from torch._inductor.autoheuristic.autoheuristic import AutoHeuristicSelectAlgorithm
from torch._inductor.autoheuristic.autoheuristic_utils import (
    AHContext,
    context_add_strides,
    context_add_using_tf32,
    mm_operations,
)
from torch._inductor.codegen.cpp_gemm_template import CppGemmTemplate
from torch._inductor.virtualized import V

from .. import config as inductor_config, ir
from ..codegen.cuda.gemm_template import CUTLASS2xGemmTemplate, CUTLASS3xGemmTemplate
from ..codegen.rocm.ck_universal_gemm_template import CKGemmTemplate
from ..codegen.wrapper import PythonWrapperCodegen
from ..ir import FlexibleLayout, is_triton
from ..lowering import register_lowering
from ..select_algorithm import (
    autotune_select_algorithm,
    ExternKernelChoice,
    TritonTemplate,
)
from ..utils import (
    get_gpu_shared_memory,
    get_tma_workspace_arg,
    use_aten_gemm_kernels,
    use_ck_gemm_template,
    use_cpp_gemm_template,
    use_cutlass_template,
    use_max_autotune,
    use_triton_template,
    use_triton_tma_template,
)
from .mm_common import (
    _is_static_problem,
    addmm_epilogue,
    extra_mm_configs,
    int8_mm_configs,
    mm_args,
    mm_configs,
    mm_grid,
    mm_options,
    persistent_mm_configs,
    persistent_mm_grid,
    persistent_mm_options,
    should_fallback_to_aten,
    triton_config,
)


def parse_version(version_string: str) -> Optional[tuple[int, ...]]:
    pattern = r"(\d+)\.(\d+)?"
    match = re.match(pattern, version_string)
    if match:
        return tuple(int(group) for group in match.groups())
    return None


try:
    import triton

    triton_version = parse_version(triton.__version__)
    has_triton = True
    if triton_version is not None:
        triton_major, triton_minor = triton_version
    else:
        triton_major = 0
        triton_minor = 0
except ImportError:
    triton_version = None
    has_triton = False
    triton_major = 0
    triton_minor = 0

log = logging.getLogger(__name__)
aten = torch.ops.aten

# Two variants of the "mm" template source are embedded in the module; the names
# _MM_SOURCE_GUARDED / _MM_SOURCE_UNGUARDED used below are reconstruction names,
# not identifiers from the original source.
_MM_SOURCE_GUARDED = r"""
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if ((stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1)) and M >= BLOCK_M:
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if ((stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1)) and N >= BLOCK_N:
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
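    # tl.multiple_of / tl.max_contiguous are compiler hints: when the stride checks
    # above show the operand is contiguous along this dimension, Triton can emit
    # coalesced, vectorized loads instead of gather-style addressing.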
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
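        # EVEN_K means K is an exact multiple of BLOCK_K, so the tail masks above
        # are compiled out entirely.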
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)
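    # Boundary tiles may run past M or N, so the final store below is masked.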

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
"""

# Second embedded variant of the same template: identical to the one above except
# that the max_contiguous fast paths are not guarded by M >= BLOCK_M / N >= BLOCK_N.
_MM_SOURCE_UNGUARDED = r"""
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return
    stride_am = {{stride("A", 0)}}
    stride_ak = {{stride("A", 1)}}
    stride_bk = {{stride("B", 0)}}
    stride_bn = {{stride("B", 1)}}

    # based on triton.ops.matmul
    pid = tl.program_id(0)
    grid_m = (M + BLOCK_M - 1) // BLOCK_M
    grid_n = (N + BLOCK_N - 1) // BLOCK_N

    # re-order program ID for better L2 performance
    width = GROUP_M * grid_n
    group_id = pid // width
    group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
    pid_m = group_id * GROUP_M + (pid % group_size)
    pid_n = (pid % width) // (group_size)

    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    if (stride_am == 1 and stride_ak == M) or (stride_am == K and stride_ak == 1):
        offs_a_m = tl.max_contiguous(tl.multiple_of(rm % M, BLOCK_M), BLOCK_M)
    else:
        offs_a_m = rm % M
    if (stride_bk == 1 and stride_bn == K) or (stride_bk == N and stride_bn == 1):
        offs_b_n = tl.max_contiguous(tl.multiple_of(rn % N, BLOCK_N), BLOCK_N)
    else:
        offs_b_n = rn % N
    offs_k = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    for k_idx in range(0, tl.cdiv(K, BLOCK_K)):
        {% if not EVEN_K %}
        a_mask = offs_k[None, :] < (K - k_idx * BLOCK_K)
        b_mask = offs_k[:, None] < (K - k_idx * BLOCK_K)
        {% endif %}
        a_k_idx_vals = offs_k[None, :] + (k_idx * BLOCK_K)
        b_k_idx_vals = offs_k[:, None] + (k_idx * BLOCK_K)

        idx_m = offs_a_m[:, None]
        idx_n = a_k_idx_vals
        {{load_input("A", "a", ("idx_m", "idx_n"), mask=None if EVEN_K else "a_mask", indent_width=8)}}

        idx_m = b_k_idx_vals
        idx_n = offs_b_n[None, :]
        {{load_input("B", "b", ("idx_m", "idx_n"), mask=None if EVEN_K else "b_mask", indent_width=8)}}
        acc += tl.dot(a, b, allow_tf32=ALLOW_TF32)

    # rematerialize rm and rn to save registers
    rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    rn = pid_n * BLOCK_N + tl.arange(0, BLOCK_N)
    idx_m = rm[:, None]
    idx_n = rn[None, :]
    mask = (idx_m < M) & (idx_n < N)

    # inductor generates a suffix
    {{store_output(("idx_m", "idx_n"), "acc", "mask")}}
)namegridsourceZmm_persistent_tmaal  
{{def_kernel("A", "B")}}
    M = {{size("A", 0)}}
    N = {{size("B", 1)}}
    K = {{size("A", 1)}}
    if M * N == 0:
        # early exit due to zero-size input(s)
        return

    start_pid = tl.program_id(0)
    grid_m = tl.cdiv(M, BLOCK_M)
    grid_n = tl.cdiv(N, BLOCK_N)
    k_tiles = tl.cdiv(K, BLOCK_K)
    num_tiles = grid_m * grid_n
    tiles_per_SM = num_tiles // NUM_SMS
    if start_pid < num_tiles % NUM_SMS:
        tiles_per_SM += 1

    tile_id = start_pid - NUM_SMS
    ki = -1
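    # Persistent scheme: NUM_SMS programs are launched and each loops over several
    # output tiles, with the tile loop and the K loop fused below.  tile_id and ki
    # start one step "behind" so the first iteration (ki wrapping to 0) advances
    # this program to its first tile.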

    width = GROUP_M * grid_n
    rk_for_mask = tl.arange(0, BLOCK_K)
    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    workspace_base = ws_ptr + start_pid * 2 * TMA_SIZE
    a_desc_ptr = workspace_base
    b_desc_ptr = workspace_base + TMA_SIZE

    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=a_desc_ptr,
        global_address=A,
        load_size=[BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
        global_size=[M, K] if A_ROW_MAJOR else [K, M],
        element_ty=A.dtype.element_ty,
    )
    triton.language.extra.cuda.experimental_device_tensormap_create2d(
        desc_ptr=b_desc_ptr,
        global_address=B,
        load_size=[BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
        global_size=[K, N] if B_ROW_MAJOR else [N, K],
        element_ty=B.dtype.element_ty,
    )

    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(a_desc_ptr)
    tl.extra.cuda.experimental_tensormap_fenceproxy_acquire(b_desc_ptr)
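    # Both TMA descriptors live in this program's slice of the workspace buffer;
    # the fences above make the device-side writes visible to the TMA engine
    # before the descriptor loads below use them.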

    pid_m = 0
    pid_n = 0
    rm = 0
    rn = 0

    for _ in range(0, k_tiles * tiles_per_SM):
        ki = tl.where(ki == k_tiles - 1, 0, ki + 1)
        if ki == 0:
            tile_id += NUM_SMS
            # re-order program ID for better L2 performance
            group_id = tile_id // width
            group_size = min(grid_m - group_id * GROUP_M, GROUP_M)
            pid_m = group_id * GROUP_M + (tile_id % group_size)
            pid_n = (tile_id % width) // (group_size)

            rm = pid_m * BLOCK_M
            rn = pid_n * BLOCK_N

        rk = ki * BLOCK_K

        a = tl._experimental_descriptor_load(
            a_desc_ptr,
            [rm, rk] if A_ROW_MAJOR else [rk, rm],
            [BLOCK_M, BLOCK_K] if A_ROW_MAJOR else [BLOCK_K, BLOCK_M],
            A.dtype.element_ty,
        )
        b = tl._experimental_descriptor_load(
            b_desc_ptr,
            [rk, rn] if B_ROW_MAJOR else [rn, rk],
            [BLOCK_K, BLOCK_N] if B_ROW_MAJOR else [BLOCK_N, BLOCK_K],
            B.dtype.element_ty,
        )
        acc += tl.dot(
            a if A_ROW_MAJOR else a.T,
            b if B_ROW_MAJOR else b.T,
            allow_tf32=ALLOW_TF32,
        )
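        # Descriptor loads return the block in its stored layout, so whichever
        # operand is not row-major is transposed before feeding tl.dot.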

        if ki == k_tiles - 1:
            # rematerialize rm and rn to save registers
            rcm = rm + tl.arange(0, BLOCK_M)
            rcn = rn + tl.arange(0, BLOCK_N)
            idx_m = rcm[:, None]
            idx_n = rcn[None, :]
            mask = (idx_m < M) & (idx_n < N)

            # inductor generates a suffix
            {{store_output(("idx_m", "idx_n"), "acc", "mask", indent_width=12)}}
            acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
""",
)


@functools.lru_cache(None)
def lazy_register_extern_choice(fn):
    return ExternKernelChoice(fn)


aten_mm = ExternKernelChoice(torch.mm, "at::mm_out")

aten_addmm = ExternKernelChoice(
    torch.addmm, "at::addmm_out", op_overload=aten.addmm.default
)

aten__int_mm = ExternKernelChoice(torch._int_mm, "at::_int_mm_out")

aten__sparse_semi_structured_mm = ExternKernelChoice(
    torch._sparse_semi_structured_mm,
    "at::_sparse_semi_structured_mm",
    has_out_variant=False,
)


def _is_int8_mat(mat):
    return mat.get_dtype() in (torch.int8, torch.uint8)


# Assumption: the threshold constant did not survive decompilation; only the
# structure of the check ("m * n > <constant>") is preserved from the bytecode.
_LARGE_BLOCK_THRESHOLD = 2**13


def _is_large_block_for_cpu(m, n, k):
    # Excludes large blocks from Triton CPU tuning to keep compile times down.
    return m * n > _LARGE_BLOCK_THRESHOLD


def mm_config_kwargs(device):
    if device == "cpu":
        return {
            "scale": 0.5,  # best-effort value recovered from the constant pool
            "exclude": _is_large_block_for_cpu,
        }
    return {}


def bias_addmm(inp, mat1, mat2, *, out=None, alpha=1, beta=1):
    """
    Giving torch.addmm a 1D tensor calls a different (faster) cublasLt
    kernel under the hood.  There are a few shapes where this is slower,
    but they are rare.
    """
    if inp.stride(0) == 0 or inp.size(0) == 1:
        return torch.addmm(inp[0], mat1, mat2, out=out, alpha=alpha, beta=beta)
    return torch.addmm(inp, mat1, mat2, out=out, alpha=alpha, beta=beta)


aten_bias_addmm = ExternKernelChoice(bias_addmm, None)


# The remainder of the module did not survive decompilation in a recoverable form.
# From the name and string tables it appears to contain:
#   - tuned_mm: registered via @register_lowering(aten.mm, type_promotion_kind=None);
#     logs "Tuned aten.mm: m=%s, n=%s, k=%s, mat1_dtype=%s, mat2_dtype=%s,
#     output_layout=%s", assembles candidate choices from aten_mm, mm_template,
#     persistent_tma_mm_template, CUTLASS2x/3xGemmTemplate, CKGemmTemplate and
#     CppGemmTemplate, applies autoheuristic reordering, and calls
#     autotune_select_algorithm("mm", ...).
#   - tuned_int_mm: aten._int_mm lowering built on aten__int_mm, int8_mm_configs
#     and the CUTLASS templates.
#   - tuned_addmm: aten.addmm lowering built on aten_addmm, aten_bias_addmm,
#     addmm_epilogue and the CUTLASS/CK/CPP templates.
#   - tuned_sparse_semi_structured_mm: aten._sparse_semi_structured_mm lowering.
#   - _is_sm7x_or_older_gpu, dims_are_int, try_heuristic, mm_autoheuristic,
#     get_size_hints, get_size_hints_strides: helpers for the heuristic and
#     autoheuristic paths.