o
    Zh                     @   s  d dl Z d dlmZ d dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ ddl	mZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZ e eZeej dddZ!eej"j#j$ddej%dZ&ej"j#Z#ej"j'Z'ej"j(Z(dddZ)dddZ*dS )    N)Any)mm_args   )configlowering)CppGemmTemplateCppWoqInt4GemmTemplate)create_epilogue_with_attr)expandregister_lowering)WeightInt4PackMatmul)autotune_select_algorithmExternKernelChoicerealize_inputs)use_aten_gemm_kernelsuse_cpp_gemm_templateuse_max_autotune)Vzat::_weight_int8pack_mmF)has_out_variantz*at::native::_weight_int4pack_mm_cpu_tensor)r   Zkernel_creatorreturnc                   C   s>   t tjtjtjg t tj t tj t tj d S N)r   Zadd_needs_realized_inputs	quantizedZ
max_pool2d
_quantizedZ$wrapped_fbgemm_pack_gemm_matrix_fp16Z!wrapped_fbgemm_linear_fp16_weightmake_fallback r   r   R/var/www/auris/lib/python3.10/site-packages/torch/_inductor/quantized_lowerings.pyregister_quantized_ops'   s   r   c                  C   s   t tjd dd ddtjdtjdtjdtdtf
dd	} t tjd dd ddtjdtjd
tdtjdtdtfdd}t	tj
 t	tj d S )N)Ztype_promotion_kind)layoutinputweightscaler   r   c          
         s   t | | dd\}}} }}| tjtjtjfv r!| tjks#J  }t r2t	||f|gng }dtj
dtf fdd}	t|||ddrVtj||||gd|	d t|d	krrtjrrt srtd
 t	||f| S td|||g|S )NT)r   mat2_transposedbufr   c                    s   t | dtt jdS )Nmul)other)r	   r   r
   size)r"   r   r    r   r   _mul_epilogueO   s   z?register_woq_mm_ops.<locals>.int8pack_mm.<locals>._mul_epilogue)r!   )Ztrans_wZepilogue_creatorr   3No choices for GEMM, using ATen backend as fallback_weight_int8pack_mm)r   	get_dtypetorchbfloat16float16floatZint8r   aten__weight_int8pack_mmbindTensorr   r   r   add_choicesleninductor_configautotune_fallback_to_atenlogwarningoutput_noder   )
r   r   r    r   _mat1mat2aten_layoutchoicesr'   r   r&   r   int8pack_mm5   sB   	

z(register_woq_mm_ops.<locals>.int8pack_mm
qGroupSizeqScaleAndZerosc                S   sD  t | ||ddd\}}}}}}| tjtjtjfv r"| tjks$J tjj	tj
|tjdd d}|}	t rBt||||f|	gng }
t rdt|	||dd|drd|  rdt| |
|	||||g t|
dkrtjrt std t||||f|	 S dtjjjd	tjfd
d}|dd d}td|
||||g|	|dS )NT)r   Zuse_4x2_dimr!   )dtype)name)r!   Zis_woq_int4Zq_group_sizer   r(   xr   c                 S   s6   |    sJ |  }|  }tjdd|tj|dS )Nr      )rA   device)
get_layoutis_contiguousget_sizeZ
get_devicer+   randintuint8)rC   shaperE   r   r   r   get_example_weight   s   zHregister_woq_mm_ops.<locals>.int4pack_mm_cpu.<locals>.get_example_weightc                 S   s   t jj|   S r   )r   graph	constantsget_name)rC   r   r   r   <lambda>   s    z>register_woq_mm_ops.<locals>.int4pack_mm_cpu.<locals>.<lambda>)r      _weight_int4pack_mm_for_cpu)input_gen_fns) r   r*   r+   r,   r-   r.   rJ   r   rM   Zadd_tensor_constantZtensorZint64r   aten__weight_int4pack_mm_cpur0   r   r   rF   rG   r   r2   r3   r4   r5   r6   r7   r8   Z	_inductorZirZIRNoder1   r   )r   r   r?   r@   r   r9   r:   r;   Z
group_sizer<   r=   rL   rS   r   r   r   int4pack_mm_cpuk   sr   	
	





z,register_woq_mm_ops.<locals>.int4pack_mm_cpu)r   atenr)   r+   r1   r   rR   intr   r   Z_dyn_quant_matmul_4bitZ_dyn_quant_pack_4bit_weight)r>   rU   r   r   r   register_woq_mm_ops4   s<   5OrX   )r   N)+loggingtypingr   r+   Z torch._inductor.kernel.mm_commonr    r   r4   r   Zcodegen.cpp_gemm_templater   r   Zcodegen.cpp_utilsr	   r
   r   Z	mkldnn_irr   Zselect_algorithmr   r   r   utilsr   r   r   Zvirtualizedr   	getLogger__name__r6   r)   r/   opsr   Zint4mm_packed_weight_cpucreaterT   r   rV   r   rX   r   r   r   r   <module>   s6    

