o
    Zh-                     @   s  d dl mZ d dlmZmZ d dlZd dlZd dlmZ d dl	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlmZmZ dd	lmZ 	
			d8dddddddee dee dee dededeee  dee d  ded fddZ!			
d9dddddddee d  ded defddZ"dd Z#G dd deZ$G d d! d!eZ%G d"d# d#eZ&G d$d% d%eZ'G d&d' d'eZ(G d(d) d)eZ)G d*d+ d+eZ*G d,d- d-eZ+G d.d/ d/eZ,G d0d1 d1eZ-G d2d3 d3eZ.G d4d5 d5eZ/G d6d7 d7eZ0dS ):    )Sequence)AnyOptionalN)make_channels_last_strides_for
OrderedSet   )ExternKernelAllocFixedLayoutFlexibleLayoutget_device_typeir_node_to_tensor is_contiguous_storage_and_layoutLayoutmay_convert_to_optionalMultiOutputMultiOutputLayoutMutationOutput
NoneLayout	TensorBox)convert_shape_to_inductorpad_listlike)VFxr   weightbiaspaddingstridedilationgroups
transposedoutput_paddingquantize_argsotherc                  C   sX  dd }dd }|   |   |dur|   tjj t|dd}t|dd}t| d }d	t|  k r>|ksAJ  J d	t|  k rN|ksQJ  J d	t|  k r^|ksaJ  J t||}t||}t||}|	du r{td	g|}	nd	t|	  k r|ksJ  J t|	|}	t|t	t
jjjfsJ |r|||}| }|||||	|||}n|durt|ddn|}tjj||||||||	|	}| }d	gtttd
t|d
  }t|g| }W d   n1 sw   Y  | ||}tdd |D  }|rt|rt|}nt|}t|t|ks$J t|dv s-J |g}|
durY|
\}}}}|   |   |   |   |||g |g ||g }n||g7 }|durv| ||}t|tsqJ ||g7 }t| | t |t |}||||g}|r|!d
|	 |dur|"| n|!d	| |||||fS )a}  
    This function is a helper function to prepare inputs, layout and constant args
    for convolution post-op fusion's create function, including deciding the output
    layout (channels first or channels last), realizing inputs and make them etc. The
    function only supports the CPU/XPU device since conv post-op fusion kernel is only
    supported on CPU/XPU right now.
    c                 S   s   t | t |ksJ dt | }|dksJ dd}d}	g }
|
| |  |
||	 |  td|D ]1}|| d ||d   d }| | d ||d   ||d  d  | ||d   }|
| q3ttt|
S )NzExpect input dim == weight dim   zExpect input dim > 2r   r   )lenappendrangelistmapint)output_sizeweight_sizer   r!   r   r   r   dimZ	BATCH_DIMZWEIGHT_INPUT_CHANNELS_DIM
input_sizedkernelZinput_size_d r1   H/var/www/auris/lib/python3.10/site-packages/torch/_inductor/mkldnn_ir.py_conv_input_size5   s(   
z<_prepare_convolution_fusion_create.<locals>._conv_input_sizec                    s   |    t }|dksJ d|dkr9g }| d |  | d |  | fddtd|D  |S | dd  }|S )Nr$   zExpect weight dim > 2r   r   c                 3   s    | ]} | V  qd S Nr1   ).0r/   Zprepacked_weight_sizer1   r2   	<genexpr>[   s    z[_prepare_convolution_fusion_create.<locals>._original_deconv_weight_size.<locals>.<genexpr>)sizer%   r&   extendr'   Z	transpose)Zprepacked_weightr   r-   r,   r1   r6   r2   _original_deconv_weight_sizeP   s   zH_prepare_convolution_fusion_create.<locals>._original_deconv_weight_sizeNT)Zguard_shaper$   r   r   c                 s   s    | ]}t |tV  qd S r4   )
isinstancer*   )r5   ir1   r1   r2   r7      s    z5_prepare_convolution_fusion_create.<locals>.<genexpr>cpuZxpu)#realizer   graphZ	fake_moder   r%   r8   r   r;   r*   sympycorenumbersIntegertorchopsatenZconvolutionr(   reversedr'   require_stride_orderallr   r   contiguous_stridesr   r   r   r
   Zget_device_or_error	get_dtyper   insertr&   ) clsr   r   r   r   r   r   r   r    r!   r"   r#   r3   r:   Zx_fakeZweight_fakedimsr,   r.   r+   Z	bias_fakeoutputreq_stride_orderZdynamic_shapesoutput_strideinputsx_scalex_zero_pointw_scalew_zero_pointkernel_layoutconstant_argsr1   r1   r2   "_prepare_convolution_fusion_create   s   
   


 

 4	




rZ   
binary_sumc                 C   sd  |   |   |dur|   | ^ }}| \}}	t||	g }
tttt| }| ||}t|t|ks@J t|dv sHJ |g}|durs|\}}}}|   |   |   |   |||g |g ||g }n||g7 }|dur|r| ||}||g }t	|
}t
| | |
|}g }|dur|| n|d| |||||fS )z
    This function is a helper function to prepare inputs, layout and constant args
    for linear post-op fusion's create function. The function only supports the CPU device
    since linear post-op fusion kernel is only supported on CPU right now.
    Nr=   r   )r?   get_sizer(   rH   r'   r%   rI   r   r   rK   r
   
get_devicerL   r&   rM   )rN   r   r   r   r"   r#   r[   m_ocr+   rQ   rS   rT   rU   rV   rW   rR   rX   rY   r1   r1   r2   _prepare_linear_fusion_create   sH   


ra   c                 C   s,   t |  | g }t|  d| _|g| _|S )Ndevice)r   Z
get_layoutr   r]   layoutoutputs)packed	output_irr1   r1   r2   _create_output_node
  s   rh   c                       sr   e Zd Z	d	d fddZ fddZedd	d
d	dd	dee dee dee dedeee	  fddZ
  ZS )ConvolutionUnaryr1   returnNc                    $   t  j|||d tjjjjdd d S )NZ,aoti_torch_cpu_mkldnn__convolution_pointwiseop_overloadZcpp_kernel_name)super__init__rE   rF   mkldnn_convolution_pointwisedefaultselfrd   rS   rY   	__class__r1   r2   ro        

zConvolutionUnary.__init__c                       | d t | d S Nz.torch/csrc/inductor/aoti_torch/c/shim_mkldnn.hinclude_extra_headerrn   codegenrt   wrapperru   r1   r2   r|   %     
zConvolutionUnary.codegenr   r   r   r   padding_stride_	dilation_r   scalarsc              	   C   sH   t | |||||||\}}}}}||t|	|
g }t|||d}t|S )Nrd   rS   rY   )rZ   r   ri   rh   )rN   r   r   r   r   r   r   r   attrr   	algorithmrS   rY   rX   r_   rf   r1   r1   r2   create)  s(   zConvolutionUnary.creater1   rj   N__name__
__module____qualname__ro   r|   classmethodr(   r*   r   r   r   __classcell__r1   r1   ru   r2   ri     s0    

ri   c                       s   e Zd Z		d	d fddZ fddZedd	d
d	dd	dd	dee dee dee dedede	e
 de	e de	ee  de	e fddZ  ZS )ConvolutionBinaryr1   rj   Nc                    s*   t  j|||d tjjjjdd || _d S )NZ3aoti_torch_cpu_mkldnn__convolution_pointwise_binaryrl   )rn   ro   rE   rF   rp   rq   binarycpp_constant_args)rt   rd   rS   rY   r   ru   r1   r2   ro   N  s   

zConvolutionBinary.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|   _  r   zConvolutionBinary.codegenr   r   r#   r   r   r   r   r   r   binary_attrbinary_alpha
unary_attrunary_scalarsunary_algorithmc              	   C   sd   t | |||||||\}}}}}| ||}|d| ||	|
|t||g }t|||d}t|S )Nr   r   )rZ   rI   rM   r   r   rh   )rN   r   r#   r   r   r   r   r   r   r   r   r   r   r   rS   rY   rX   rQ   r_   rf   r1   r1   r2   r   c  s0   zConvolutionBinary.create)r1   r1   r   )r   r   r   ro   r|   r   r(   r*   strr   floatr   r   r   r1   r1   ru   r2   r   M  sF    	

r   c                       s   e Zd Z	d	d fddZ fddZdeej fdd	Ze	d
dddddddde
e de
e de
e dededee dee dee
e  dee fddZ  ZS )ConvolutionBinaryInplacer1   rj   Nc                    s~   |d |d g|dd   }t  j|||d tjjjjdd tt|d 	 d|d | tt|d 	 d|d | g| _
d S )Nr   r   r$   Z4aoti_torch_cpu_mkldnn__convolution_pointwise_binary_rl   rb   )rn   ro   rE   rF   rp   Z_convolution_pointwise_r   r   r   r]   Zmutation_outputs)rt   rX   rS   rY   Zreordered_inputsru   r1   r2   ro     s   


z!ConvolutionBinaryInplace.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|     r   z ConvolutionBinaryInplace.codegenc                 C      t  S r4   r   rt   r1   r1   r2   get_unbacked_symbol_defs     z1ConvolutionBinaryInplace.get_unbacked_symbol_defsr   r   r#   r   r   r   r   r   r   r   r   r   r   r   c              	   C   st   t | |||||||\}}}}}| ||}|d| ||	|
|t||g }tt|d  d||d}|jd S )Nr   rb   )rX   rS   rY   r   )rZ   rI   rM   r   r   r   r]   rS   )rN   r   r#   r   r   r   r   r   r   r   r   r   r   r   rS   rY   r_   rQ   rf   r1   r1   r2   r     s0   
zConvolutionBinaryInplace.creater   r   )r   r   r   ro   r|   r   rA   Symbolr   r   r(   r*   r   r   r   r   r   r   r1   r1   ru   r2   r     sF    	

r   c                       sz   e Zd Z	d	d fddZ fddZedd	d
d	dd	dee dee dee dee dedeee	  fddZ
  ZS )ConvolutionTransposeUnaryr1   rj   Nc                    rk   )NZ6aoti_torch_cpu_mkldnn__convolution_transpose_pointwiserl   )rn   ro   rE   rF   rp   Z _convolution_transpose_pointwiserr   rs   ru   r1   r2   ro     rw   z"ConvolutionTransposeUnary.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|     r   z!ConvolutionTransposeUnary.codegenr   r   r   r   r   output_padding_r   r   groups_r   c                 C   sP   d}t | |||||||||
\}}}}}||	t|
|g }t|||d}t|S )NTr   )rZ   r   r   rh   )rN   r   r   r   r   r   r   r   r   r   r   r   r    rS   rY   rX   r_   rf   r1   r1   r2   r     s<   z ConvolutionTransposeUnary.creater   r   r   r1   r1   ru   r2   r     s4    	
r   c                       s~   e Zd Z	d	d fddZ fddZedd	d
d	dd	dd	dd	dd	dd	dee dee dee dededefddZ	  Z
S )QConvPointWisePT2Er1   rj   Nc                    s2   t |dk| _t j|||dtjjjjdd dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [stride, padding, dilation, groups, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, stride, padding, dilation, groups, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
           NZ(aoti_torch_cpu__qconv2d_pointwise_tensorrl   )	r%   has_biasrn   ro   rE   rF   onednnqconv2d_pointwiserr   rs   ru   r1   r2   ro     s   

zQConvPointWisePT2E.__init__c                    4   | d t | t| jtr| | d S d S ry   r{   rn   r|   r;   rd   r   Zcodegen_size_assertsr}   ru   r1   r2   r|   9  
   
zQConvPointWisePT2E.codegenqxr   rT   rU   qwrV   rW   r   r   r   r   r   output_scaleoutput_zero_pointc                 C   s   d}d }t | ||||	||
|||||||g\}}}}}|d u r-|d |d |d< |d< n|d |d |d< |d< |||||t||g }|d usLJ |tjtjfv rW||_t|||dS )NFr$   r   r   r   )rZ   r   rE   float32bfloat16dtyper   )rN   r   rT   rU   r   rV   rW   r   r   r   r   r   r   r   output_dtyper   r   r   r    r!   rS   rY   rX   r_   r1   r1   r2   r   ?  sP   
	zQConvPointWisePT2E.creater   r   )r   r   r   ro   r|   r   r(   r*   r   r   r   r1   r1   ru   r2   r     sD    	
r   c                       s   e Zd Z	d	d fddZ fddZdd	 Zdeej fd
dZ	e
dddddddddddddee dee dee deddddfddZ  ZS )QConvPointWiseBinaryPT2Er1   rj   Nc                    s8   t |dk| _d| _t j|||dtjjjj	dd dS )ag  
        Needs input/weight/output qparams
        if bias is not None
            - inputs = [x, x_scale, x_zp, w,  w_scale, w_zp, accum, b]
            - const_args = [stride, padding, dilation, groups, o_scale, o_zp,
            output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, x_scale, x_zp, w,  w_scale, w_zp, accum]
            - const_args [b, stride, padding, dilation, groups, o_scale, o_zp,
             output_dtype, accum_scale, accum_zp, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
              NZ/aoti_torch_cpu__qconv2d_pointwise_binary_tensorrl   )
r%   r   idx_for_inplace_sumrn   ro   rE   rF   r   r   r   rs   ru   r1   r2   ro     s   

z!QConvPointWiseBinaryPT2E.__init__c                    r   ry   r   r}   ru   r1   r2   r|     r   z QConvPointWiseBinaryPT2E.codegenc                 C   s   | j | j  gS r4   )rS   r   get_namer   r1   r1   r2   get_mutation_names  s   z+QConvPointWiseBinaryPT2E.get_mutation_namesc                 C   r   r4   r   r   r1   r1   r2   r     r   z1QConvPointWiseBinaryPT2E.get_unbacked_symbol_defsr   r   rT   rU   r   qaccumr   r   r   r   r   r   r   c                 C   s   d}d }t | ||||
|	||||||||g|\}}}}}|d u r.|d |d |d< |d< n|d |d |d< |d< |||||||||t||g
 }|dksSJ dtj|  tt| d||d}|j	|j
 S )	NFr$   r   r   sumzCFor now, only post op sum is supported in QConvPointWiseBinaryPT2E.rb   r   )rZ   r   r   r@   mark_buffer_mutatedr   r   r   r]   rS   r   )rN   r   rT   rU   r   rV   rW   r   r   r   r   r   r   r   r   r   Zaccum_scaleZaccum_zero_pointr   alphar   r   r   r    r!   rS   rY   Z_kernel_layoutrQ   rf   r1   r1   r2   r     s^   

zQConvPointWiseBinaryPT2E.creater   r   )r   r   r   ro   r|   r   r   rA   r   r   r   r(   r*   r   r   r1   r1   ru   r2   r     sD    	
r   c                       s<   e Zd Z	d
	d fddZ fddZedd	 Z  ZS )MKLPackedLinearr1   rj   Nc                    "   t  j|||d tjjjjd d S N)rm   )rn   ro   rE   rF   ZmklZ_mkl_linearrr   rs   ru   r1   r2   ro        

zMKLPackedLinear.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|     r   zMKLPackedLinear.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}}t||g }	t|	}
|||g}|g}|d ur;||g7 }n|dd  tt|	 |
 |	|
||dS )Nr   r   )require_stride1realize_inputr\   r(   r   rK   rM   r   r
   r]   rL   )rN   r   Zpacked_wZorig_wBZ
batch_sizer^   r_   r`   r+   rR   rS   rY   r1   r1   r2   r     s$   

zMKLPackedLinear.creater   r   r   r   r   ro   r|   r   r   r   r1   r1   ru   r2   r     s    r   c                       sD   e Zd Z	d	d fddZ fddZedd	 Zd
d Z  ZS )LinearUnaryr1   rj   Nc                    rk   )NZ aoti_torch_cpu__linear_pointwiserl   )rn   ro   rE   rF   rp   _linear_pointwiserr   rs   ru   r1   r2   ro   -  rw   zLinearUnary.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|   <  r   zLinearUnary.codegenc                 C   s   |  | |}|  | |}| ^ }}| \}	}t||	g }
||g}||r-|ndg|g}|d urD|  | |}|| n|dd  tt| |	 |
d||d}t
|S )Nr   rc   r   r8   r   )require_contiguousr   r\   r(   r&   rM   r   r
   r]   rL   rh   )rN   r   wr   r   r   r   r^   _icr`   r+   rS   rY   rf   r1   r1   r2   r   @  s*   	zLinearUnary.createc                 C      d S r4   r1   r   r1   r1   r2   apply_constraint[     zLinearUnary.apply_constraintr   r   )	r   r   r   ro   r|   r   r   r   r   r1   r1   ru   r2   r   ,  s    
r   c                       sH   e Zd ZdZ	d	d fddZ fddZed	d
 Zdd Z  Z	S )LinearBinaryz)torch.ops.mkldnn._linear_pointwise.binaryr1   rj   Nc                    rk   )NZ'aoti_torch_cpu__linear_pointwise_binaryrl   )rn   ro   rE   rF   rp   r   r   rs   ru   r1   r2   ro   b  rw   zLinearBinary.__init__c                    rx   ry   rz   r}   ru   r1   r2   r|   q  r   zLinearBinary.codegenc                 C   s   |  | |}|  | |}|  | |}| ^ }}| \}}t||g }	|||g}
|g}|d urF|  | |}|
| n|d| tt| |	 |	d|
|d}t
|S )Nr   r   r   )r   r   r\   r(   r&   rM   r   r
   r]   rL   rh   )rN   r   yr   r   r   r^   r   r`   r+   rS   rY   rf   r1   r1   r2   r   u  s,   
	zLinearBinary.createc                 C   r   r4   r1   r   r1   r1   r2   r     r   zLinearBinary.apply_constraintr   r   )
r   r   r   r0   ro   r|   r   r   r   r   r1   r1   ru   r2   r   _  s    
r   c                       sd   e Zd Z		d	d fddZ fddZed	d
dd
dd
dd
dd
dd
dd
dedefddZ  Z	S )QLinearPointwisePT2Er1   Trj   Nc                    s*   || _ t j|||dtjjjjdd dS )a  
        if bias is not None
            - inputs = [x, w, b, weight_scale, weight_zp]
            - const_args is: [x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, weight_scale, weight_zp]
            - const_args is: [bias, x_scale, x_zp, o_scale, o_zp,
              fp32_output, unary_attr, unary_scalars, unary_algorithm]
        NZ(aoti_torch_cpu__qlinear_pointwise_tensorrl   )r   rn   ro   rE   rF   r   qlinear_pointwiseZtensorrt   rd   rS   rY   r   ru   r1   r2   ro     s   

zQLinearPointwisePT2E.__init__c                    r   ry   r   r}   ru   r1   r2   r|     
   
zQLinearPointwisePT2E.codegenr   r   rT   rU   r   rV   rW   r   r   r   c              	   C   sp   t | |||||||g\}}}}}|||	|
|t||g }|
d us#J |
tjtjfv r.|
|_t||||d udS )Nrd   rS   rY   r   )ra   r   rE   r   r   r   r   )rN   r   rT   rU   r   rV   rW   r   r   r   r   Zpost_op_nameZpost_op_argsZpost_op_algorithmrS   rY   rX   r_   r1   r1   r2   r     s0   
	zQLinearPointwisePT2E.creater1   Tr   )
r   r   r   ro   r|   r   r   r*   r   r   r1   r1   ru   r2   r     s6    	
r   c                       sp   e Zd Z		d	d fddZ fddZd	d
 ZedddddddddddddddddedefddZ	  Z
S )QLinearPointwiseBinaryPT2Er1   Trj   Nc                    s0   || _ d| _t j|||dtjjjjdd dS )a  
        if bias is not None
            - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2, bias]
            - const_args is: [o_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        else
            - inputs = [x, w, x_scale, x_zp, weight_scale, weight_zp, x2]
            - const_args is: [bias, o_scale, o_zp,
              fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm]
        r   NZ/aoti_torch_cpu__qlinear_pointwise_binary_tensorrl   )	r   r   rn   ro   rE   rF   r   r   Zbinary_tensorr   ru   r1   r2   ro     s   

z#QLinearPointwiseBinaryPT2E.__init__c                    r   ry   r   r}   ru   r1   r2   r|     r   z"QLinearPointwiseBinaryPT2E.codegenc                 C   s(   | j d }|dkr| j| j  gS g S )Nr   )rY   rS   r   r   )rt   binary_post_opr1   r1   r2   r     s   
z-QLinearPointwiseBinaryPT2E.get_mutation_namesr   r   rT   rU   r   rV   rW   r#   r   r   r   c                 C   s   t | |||||||g||dk\}}}}}||	|
||||||t||g
 }|dkrFtj|  tt| d|||d ud}|j	|j
 S |d usLJ |tjtjfv rW||_t||||d udS )Nr   rb   r   )ra   r   r   r@   r   r   r   r   r]   rS   r   rE   r   r   r   )rN   r   rT   rU   r   rV   rW   r#   r   r   r   r   Zother_scaleZother_zpr   r   Zunary_post_opZunary_post_op_argsZunary_post_op_algorithmrS   rY   rX   rQ   rf   r1   r1   r2   r     sZ   
z!QLinearPointwiseBinaryPT2E.creater   r   )r   r   r   ro   r|   r   r   r   r*   r   r   r1   r1   ru   r2   r     s<    	
r   c                !       s   e Zd Z	d	d fddZeddddd	dd
ddddddddedee dededededededef ddZ fddZ	  Z
S )MkldnnRnnLayerr1   rj   Nc                    r   r   )rn   ro   rE   rF   rG   Zmkldnn_rnn_layerrr   rs   ru   r1   r2   ro   _  r   zMkldnnRnnLayer.__init__r   r   w0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc              	      sf  |  |   |  | |}|  | |}|  | |}|  | |}|  | |}|  |  | |}|   }t|dksRJ d|\}}}|||g}| }| }||||||g}||	|
||||||g	}tt d||d dd }|||dgg}|||t	|t	|dgg} fdd	t
t||D }| _|S )
N   zExpect lstm input to be 3Drb   )rS   rY   c                 S   s   t | dks
J dt| S )Nr   zExpect output_shape to be 3D)r%   r   rK   )output_shaper   r1   r1   r2   get_strides_of_lstm_output  s   
z9MkldnnRnnLayer.create.<locals>.get_strides_of_lstm_outputr   c                    s8   g | ]\}\}}t t  || t|fgqS r1   )r   r
   r]   rL   tuple)r5   r<   r+   rR   rf   r   r1   r2   
<listcomp>  s    
z)MkldnnRnnLayer.create.<locals>.<listcomp>)r   r   Zfreeze_layoutr\   r%   r   r   r]   r   rK   	enumeratezipre   )rN   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r.   Z
seq_lengthZ
mini_batchr   Zhy_shapeZcy_shaperS   rY   r   Zoutput_sizesZoutput_stridesrg   r1   r   r2   r   m  s\   

zMkldnnRnnLayer.createc                    s   | d t |S ry   rz   r}   ru   r1   r2   r|     s   
zMkldnnRnnLayer.codegenr   r   )r   r   r   ro   r   boolr(   r*   r   r|   r   r1   r1   ru   r2   r   ^  sP    	
]r   c                       sN   e Zd Z	d	d fddZ fddZe				
								dddZ  ZS )WeightInt4PackMatmulr1   rj   Nc                    sD   t |dksJ t |dksJ t j|||dtjjjjdd dS )zY
        inputs = [x, w, qGroupSize, qScalesAndZeros]
        constant_args = ()
           r   NZ-aoti_torch_cpu__weight_int4pack_mm_cpu_tensorrl   )r%   rn   ro   rE   rF   Z	quantizedZint4mm_packed_weight_cpurr   rs   ru   r1   r2   ro     s   


zWeightInt4PackMatmul.__init__c                    r   ry   r   r}   ru   r1   r2   r|     r   zWeightInt4PackMatmul.codegenr   r   r   
qGroupSizeqScalesAndZerosc                 C   s`   ||||g}|  ^ }}|  \}}t||g }	t|	}
t| | |	|
}t||dS )N)rd   rS   )r\   r(   r   rK   r
   r]   rL   r   )rN   r   r   r   r   rS   r^   r_   nr+   rR   rX   r1   r1   r2   r     s   
zWeightInt4PackMatmul.creater   r   )r   r   r   r   r   r   r   r   r   r1   r1   ru   r2   r     s     r   )FNNN)NNF)1collections.abcr   typingr   r   rA   rE   Ztorch._prims_commonr   Ztorch.utils._ordered_setr   Zirr	   r
   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   Zvirtualizedr   r*   r   r(   rZ   ra   rh   ri   r   r   r   r   r   r   r   r   r   r   r   r   r1   r1   r1   r2   <module>   s   <	



 3

?8AMCg|+36Svs