o
    Zh                     @   s  d dl Z d dlmZ d dlZd dlm  mZ d dlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$ 				ddede%e de%e fddZ&de&_'dd Z(dS )    N)Optional)mm_args   )ir)CppGemmTemplate)CppGroupedGemmTemplatecreate_epilogue_with_attr)	TensorBox)addadd_needs_realized_inputsatenpermuteregister_loweringto_dtypeview)autotune_select_algorithmChoiceCallerExternKernelChoice)use_aten_gemm_kernelsuse_cpp_gemm_templateuse_max_autotune)opsVxwbc                    s    }t|dkrtd|d gt|}t sJ dd |D }g }	tt|d ddg d^ }
 }
tdd |D d	d fd
dt|D d}g|}|dd |D  t	j
|	 |fi | t|	dkspJ td|	| }|jj fddt|D tj|d  d__fddt|D }t|dkrt|D ]}t|| g |d d ||   d R ||< q|S )N   c                 S   s$   g | ]}|d u r
|nt j|qS N)r   ExternKernelrealize_input.0bias r%   O/var/www/auris/lib/python3.10/site-packages/torch/_inductor/mkldnn_lowerings.py
<listcomp>0   s   $ z)grouped_gemm_lowering.<locals>.<listcomp>r   r   layoutc                 S   s   g | ]}|d uqS r   r%   r"   r%   r%   r&   r'   6   s    Tc                    s   i | ]}| qS r%   r%   )r#   numr   r%   r&   
<dictcomp>9   s    z)grouped_gemm_lowering.<locals>.<dictcomp>)has_biastrans_wepilogue_creatorZact_mappingc                 S   s   g | ]}|d ur|qS r   r%   r"   r%   r%   r&   r'   =   s    Zgrouped_gemmc                    s    g | ]}t  t|fgqS r%   )r   ZMultiOutputlistr#   gemm_idx)r)   template_bufr%   r&   r'   N   s    )devicec                    s   g | ]
}t j | qS r%   )r   r
   creater1   )return_bufsr%   r&   r'   T   s    )get_sizelenr   r   r   r   dictrangeextendr   add_choicesr   datar   ZMultiOutputLayout
get_devicer)   Zoutputs)r   r   r   attrscalars	algorithmr)   x_sizeZnum_gemmchoices_kwargsZinput_nodesresultZreturn_tensorsr2   r%   )r)   r6   r3   r   r&   grouped_gemm_lowering    s\   	
&

 
rG   Tc               !      s  t jjrddlm tt jjjddj	j
dtt jjjjddjj
dtt jjjddjj
dtt jjjjddjj
dt jjjt jjjt jjjt jjjtjjt jjjg} tt jjjdtdtd	tffd
d}tt jjjjdtdtdtd	tffdd}tt jjjjdtdtdtd	tffdd}tt jjj	 d?dtdtdtffdd}tt jjjj	 d?dtdtdtdtffdd}tt jjjdtdtd	tffdd}ttjjdtdtdtdtdtdtdtd td!tt d"td#td$td%td&td'td(tf fd)d*}tt jjjd d+dtd,td-td.td	tf
fd/d0}tt jjjjd d+tt jjjjd d+dtd,td-td.td1td	tffd2d3}	tt jjjd d+	 d?dtd,td-td.td	tf
fd4d5}
tt jjjjd d+tt jjjjd d+	 d?dtd,td-td.td6td	tffd7d8}t jjrtt jjj d9dj!j
d | "t jjj  tt jjj d d:dtd;td<tdt#t f fd=d>}t$|  d S 	 d S )@Nr   	mkldnn_irzmkldnn::_linear_pointwiseF)Zhas_out_variantZkernel_creatorzonednn::qlinear_pointwiser   weightr$   c
           
         s$   t  j| |||||||||	
S r   )r
   r5   ZConvolutionUnary)
r   rJ   r$   paddingstridedilationgroupsr?   r@   rA   rH   r%   r&   convolution_unary   s   z5register_onednn_fusion_ops.<locals>.convolution_unaryotherc                    *   t  j| |||||||||	|
||S r   )r
   r5   ZConvolutionBinaryr   rP   rJ   r$   rK   rL   rM   rN   binary_attrbinary_alpha
unary_attrunary_scalarsZunary_algorithmrH   r%   r&   convolution_binary   "   z6register_onednn_fusion_ops.<locals>.convolution_binaryc                    rQ   r   )r
   r5   ZConvolutionBinaryInplacerR   rH   r%   r&   convolution_binary_inplace   rX   z>register_onednn_fusion_ops.<locals>.convolution_binary_inplacer   r   c                    s  |   }t|dkrt| d|d g} |d urtj|}g }t rrt|ddg}	t| |	|d^ }
}} }	t	|| |	rr fdd}t
|d udd	krNd n|d
}|d ur\g d|d< tj|||d u rh| |gn| ||gfi | t|dks{t rt
 d}|d u rd |d< |j|d u r| |gn| ||g|fi | | tjjv sJ ddd i}td||d u r| |gn| ||g||d}t|dkrt|g |d d |  d R }|S )Nr   r   r   r   r(   c                    s   t |  dS )Nr@   rA   r   bufrA   r?   r@   r%   r&   r/      s   zJregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.epilogue_creatorTnoner-   r.   r/   )r   r   r   input_indices)r?   r@   rA   Bc                 S      t jj|   S r   r   graph	constantsget_namer+   r%   r%   r&   <lambda>      zBregister_onednn_fusion_ops.<locals>.linear_unary.<locals>.<lambda>linear_unaryinput_gen_fnsr7   r8   r   r   r    r!   r   r   r   r   r9   r   r<   r   appendbindrf   r   rd   re   r   )r   r   r   r?   r@   rA   r)   rB   rC   transposed_wrD   r/   rE   rk   rF   )aten_mkldnn_linear_unaryr]   r&   ri      sd   
$z0register_onednn_fusion_ops.<locals>.linear_unaryyc                    s  |   }t|dkrt| d|d g}   }t|dkr&td|d g|d ur0tj|}g }t rt|ddg}	t| |	|d^ }
}} }	t	|| |	r fdd}t
|d ud|d	}|d u rhg d
ng d|d< tj|||d u r{| |gn| ||gfi | t|dkst rt
 d}|d u rd |d< |j|d u r| |gn| ||g|fi | | tjjv sJ ddd i}td||d u r| |gn| ||g||d}t|dkrt|g |d d |  d R }|S )Nr   r   r   r   r(   c                    s   t |  dS )N)rP   r   r[   r?   rq   r%   r&   r/   ?  s   zKregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.epilogue_creatorTr_   )r   r   r   )   r   r   r   r`   )r?   ra   c                 S   rb   r   rc   r+   r%   r%   r&   rg   [  rh   zCregister_onednn_fusion_ops.<locals>.linear_binary.<locals>.<lambda>linear_binaryrj   rl   )r   rq   r   r   r?   r)   rB   Zy_sizerC   ro   rD   r/   rE   rk   rF   )aten_mkldnn_linear_binaryrr   r&   rt   *  sl   
$z1register_onednn_fusion_ops.<locals>.linear_binaryc                    s&   t  j| |||||||||	|
S r   )r
   r5   ZConvolutionTransposeUnary)r   rJ   r$   rK   Zoutput_paddingrL   rM   rN   r?   r@   rA   rH   r%   r&   convolution_transpose_unaryh  s   z?register_onednn_fusion_ops.<locals>.convolution_transpose_unaryw0w1w2w3hxcxreversebatch_sizesmodehidden_size
num_layers
has_biasesbidirectionalbatch_firsttrainc                    s4   t tj j| |||||||||	|
|||||S r   )pytreeZtree_mapr
   r5   ZMkldnnRnnLayer)r   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   rH   r%   r&   mkldnn_rnn_layer  s*   z4register_onednn_fusion_ops.<locals>.mkldnn_rnn_layer)Ztype_promotion_kindpacked_weightw_scalew_zpc                    s   t |tksJ tjjtj|tjddd}t |tksJ tjjtj|tj	ddd}t
 j| |||||||||	|
||||||S )Ndtypex_scalenamex_zp)typefloatr   rd   add_tensor_constanttorchtensorfloat32intint32r
   r5   ZQConvPointWisePT2E)r   r   r   r   r   r   r$   rL   rK   rM   rN   o_inv_scaleo_zero_pointoutput_dtyper?   r@   rA   rH   r%   r&   qconvolution_unary  s:   z6register_onednn_fusion_ops.<locals>.qconvolution_unaryaccumc                    s   t |tksJ tjjtj|tjddd}t |tksJ tjjtj|tj	ddd}|dkrM|tjtj
fv rM| tjtj
fv rM| |krMt||}t j| |||||||||	|
|||||||||||S )Nr   r   r   r   sum)r   r   r   rd   r   r   r   r   r   r   bfloat16	get_dtyper   r
   r5   ZQConvPointWiseBinaryPT2E)r   r   r   r   r   r   r   r$   rL   rK   rM   rN   r   r   r   Zaccum_scaleZaccum_zprS   alpharU   rV   unary_algorithmmrH   r%   r&   qconvolution_binary  sN   
z7register_onednn_fusion_ops.<locals>.qconvolution_binaryc                    s  |  tju sJ d|  }t|dkrt| d|d g} t
tjs;t	
t
ks,J tjjtj
tjddd
n 
  tdd 
 D rOt
g 
t
 d	v s[J d
d u rmtjjtjdtjdddttjst	tks{J tjjtjtjdddn   dksJ d|d u rtjjtjdtjddd}  |  |  tjkrttj|tjrtjj|  tj}tjjtj|tjd| d}d u rd n  g }t rt| ||d^ }}} }ttj|tjrtttjj|  tjj|  rt|| |rtjj|    }tj!|tj
dd}tjj|| d d	 	
fdd}|   tj"tjfv scJ t#j$||d u rt| 
||gn| 
||gd u|d u rg dng dd t|dkst% rt& d}d u rd |d< |'j(d u r| 
||fn| 
||f|fi | | tjjv sJ dd dd dd dd d}ttj
tjrd d |d< ttjtjr
d!d |d< t)d"|d u r| 
||gn| 
||g||d#}t|dkr@t|g |d d | d R }|S )$Nz2Only int8 weights are supported by oneDNN qlinear.r   r   r   r   r   c                 s       | ]}|d kV  qdS r   Nr%   r#   dimr%   r%   r&   	<genexpr>P      zDregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<genexpr>r   r   x_scale must be 0D or 1Dr   r   r   z(x_zp is incompatible with oneDNN qlinearr   r)   Z	out_dtyper   _BMatrixCompensc                    sJ  t jt jt jt jfv sJ |      d d ur, fdd}tj|  t j|| 	 d}
dkrSt
|
	d}t jkrr| fdd}tj| ||	 d}|S t jt jfv rdd	lm  |  fd
d}tj| tj|ttd|	 d}|S )Nc           	         s   | }t |tj}| d f}d}d}|}|}t t |||}t |t t t ||||} d ur`|}tjtjfv sNJ tjkrZt |tj}t ||}|S )Nr   r%   r   r   r   r   mulsubr   r   )	indexinputweight_compens_index_x_scale_x_zp_w_scaleZ_weight_compotemp_bias)r$   
bias_dtypebias_loaderinput_loaderw_scale_loaderweight_compens_loaderx_scale_loaderx_zp_loaderr%   r&   inner_fn  sD   

z]register_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fnr4   r   r   rangesr^   rZ   c                        | }t |S r   r   r   r   r   output_cast_loaderr   r%   r&   inner_fn_cast_output_to_bf16     zqregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16r   _create_constantsc           	         s   | } d| |t jd\}}t|| | }t jkr) ddt jd\}}n
 ddt jd\}}tt|||}t|S Ng      ?r   r      i   r   r   r   rounduint8minimummaximumr   	r   scale
zero_pointr   Z	inv_scalevalZqminZqmaxZclampedr   r   requant_input_loaderr%   r&   inner_fn_requant   s   


zeregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator.<locals>.inner_fn_requantr   r   r   r   r   r   int8Zmake_loaderr   Z	Pointwiser>   r7   r	   Zget_device_or_errorloweringr   	functoolspartialr   r   Zinput_bufferr   Z
output_bufr   r   rA   r?   r$   r   o_scaler   r   r@   r   weight_compensr   r   )	r   r   r   r   r   r   r   r   r   r&   r/     sd   
5
'zKregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.epilogue_creator)r   rs   r   r         )   r   rs   r   r   r   r   r-   r/   r`   )output_scaleoutput_zero_pointr   Zpost_op_nameZpost_op_argsZpost_op_algorithmr$   c                 S   rb   r   rc   r+   r%   r%   r&   rg   A  rh   zCregister_onednn_fusion_ops.<locals>.qlinear_unary.<locals>.<lambda>c                 S   rb   r   rc   r+   r%   r%   r&   rg   B  rh   c                 S   rb   r   rc   r+   r%   r%   r&   rg   C  rh   c                 S   rb   r   rc   r+   r%   r%   r&   rg   D  rh   )rs   r   r   r   c                 S   rb   r   rc   r+   r%   r%   r&   rg   K  rh   c                 S   rb   r   rc   r+   r%   r%   r&   rg   P  rh   qlinear_unaryrj   )*r   r   r   r7   r8   r   
isinstancer   r
   r   r   r   rd   r   r   r   realizeallr   r   Z	get_numelInputsKernelunwrap_storage_for_inputConstantBufferre   rf   tor   r   equal
zeros_liker   to_denser   r   r   r<   r   r9   rm   rn   r   )r   r   r   r   r   r   r$   r   r   r   r?   r@   rA   r)   rB   w_zp_tensorrC   rD   W_tensorweight_compens_tensorr/   rE   rk   rF   )aten_mkldnn_qlinear_unaryr   r&   r   1  s   





" 



	


	$z1register_onednn_fusion_ops.<locals>.qlinear_unaryx2c                    s  |   }
  }t|t|ksJ t|dkr.|dkr.t| d|d g} t
d|d g
ttjsKttks<J tj	j
tjtjdddn   tdd   D r_tg t  d	v skJ d
d u r}tj	j
tjdtjddd|d u rtj	j
tjdtjddd}ttjsttksJ tj	j
tjtjdddn    |  | tjkrttj|tjrtj	j|  tj}tj	j
tj|tjd| d}|dkrtjtjfv r
 tjtjfv r
 kr
t

n
 ksJ d
  d ur#  nd g }t r|dkrt| |
|d^ }}} }
ttjtjrt jdkrttj|tjrtt tj	j|  tj	j|  rt!|| |rtj	j|  }|" }tj#|tjdd}tj	j
|| d d	 	
fdd}t$j%|| d u r| ||
gn	| ||
 g d u| d u rg dng dd t|dkst& r%t'||||d
} d u rd |d< |(j) d u r| ||
fn	| ||
 f|fi | | tj	jv s0J dd dd dd d} d urGdd |d < t*d!| d u rX| ||
gn	| ||
 g||d"}t|dkr|dkrt|g |d d |  d R }|S )#Nr   r   r   r   r   r   c                 s   r   r   r%   r   r%   r%   r&   r     r   zEregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<genexpr>r   r   r   r   r   r   zCdtype of accum for qlinear post op sum should be the same as outputr   r   r   c              
      sX  t jt jt jt jfv sJ |       	d 
d ur0
 
	f
dd}tj|  t j|| 	 d}dkrYt
|d}t jkrx| fdd}tj| ||	 d}|S t jt jfv rdd	lm  |  fd
d}tj| t jtj|ttd|	 d}|S )Nc           
         s  | }| }d}	d}t |tj}| d f}|}|}t t |||}t |t t t ||||} d urd|}	tjtjfv sRJ tjkr^t |	tj}	t ||	}tjtjfv snJ tjkrzt |tj}t ||}|S )Nr%   r   r   )
r   r   Z_x2r   r   r   r   Z_weight_compensr   r   )
r$   r   r   r   r   r   x2_dtype	x2_loaderr   r   r%   r&   r     sR   


z^register_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fnr   r^   rZ   c                    r   r   r   r   r   r%   r&   r   =  r   zrregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_cast_output_to_bf16r   r   c           	         s   | } d| |t jd\}}t|| | }t jkr) ddt jd\}}n
 ddt jd\}}tt|||}t|t jS r   r   r   r   r%   r&   r   L  s   


zfregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator.<locals>.inner_fn_requantr   r   r   r$   r   r   r   r   r   rU   rV   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   r&   r/     sl   
7
'zLregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.epilogue_creator)r   rs   r   r   r   r   r   )   r   rs   r   r   r   r   r   r   )
r   r   r   Zother_scaleZother_zpZbinary_post_oprT   Zunary_post_opZunary_post_op_argsZunary_post_op_algorithmr$   c                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   zDregister_onednn_fusion_ops.<locals>.qlinear_binary.<locals>.<lambda>c                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   c                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   )rs   r   r   c                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   r   qlinear_binaryrj   )+r7   r8   r   r   r   r
   r   r   r   rd   r   r   r   r   r   r   r   r   r   r   r   r   re   rf   r   r   r   r   r   Z
get_layoutsizer   r   r   r   r   r   r<   r   r9   rm   rn   r   )r   r   r   r   r   r   r   r$   r   r   r   Zx2_scaleZx2_zprS   r   rU   rV   r   r)   rB   Zx2_sizer   rC   rD   r   r   r/   rE   rk   rF   )aten_mkldnn_qlinear_binaryr   r&   r   _  s  








	
& 



	

	$z2register_onednn_fusion_ops.<locals>.qlinear_binaryzmkl::_mkl_linearr(   packed_worig_wc                   s   g }t  r-t|ddg}t| ||d^ }}} }t|| |r-tj||| ||gdddgd t|dks6t rE| j	| ||f|d |d |
 tjjv sOJ |
 tjjv sYJ dd	 d
d	 d}	td|| ||g||	d}
|d urwt|
|}
|
S )Nr   r   r(   Tr   )r.   r`   )ra   
batch_sizec                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   zGregister_onednn_fusion_ops.<locals>.mkl_packed_linear.<locals>.<lambda>c                 S   rb   r   rc   r+   r%   r%   r&   rg     rh   )r   r   Zpacked_linearrj   )r   r   r   r   r   r<   r8   r   rm   rn   rf   r   rd   re   r   r   )r   r  r  r   r  r)   rC   ro   rD   rk   rF   )aten_mkl_linearr%   r&   mkl_packed_linear  sF   

z5register_onednn_fusion_ops.<locals>.mkl_packed_linearr   )%r   Z_CZ_has_mkldnn rI   r   r   ZmkldnnZ_linear_pointwiseZLinearUnaryr5   binaryZLinearBinaryZonednnZqlinear_pointwiseZQLinearPointwisePT2EZQLinearPointwiseBinaryPT2EZ_convolution_pointwiseZ_convolution_pointwise_Z _convolution_transpose_pointwiser   r   defaultZqconv2d_pointwiser   r
   boolr0   r   Zbinary_tensorZhas_mklZmklZ_mkl_linearZMKLPackedLinearrm   r   r   )Zcpu_needs_realized_inputsrO   rW   rY   ri   rt   rv   r   r   r   r   r   r  r%   )r  ru   rp   r  r   rI   r&   register_onednn_fusion_opsc   s  


	!!A=
	
(3H  /  
B2r  )NNNN))r   typingr   r   Ztorch.utils._pytreeutilsZ_pytreer   Z torch._inductor.kernel.mm_commonr   r  r   Zcodegen.cpp_gemm_templater   Z!codegen.cpp_grouped_gemm_templater   Zcodegen.cpp_utilsr	   r
   r   r   r   r   r   r   r   r   Zselect_algorithmr   r   r   r   r   r   Zvirtualizedr   r   r0   rG   Z_inductor_lowering_functionr  r%   r%   r%   r&   <module>   s6   $	
@