
          S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	J
r
JrJr  SSKJrJrJrJr  \" 5       (       a
  SSKrSSKJr  \R*                  " \5      r/ S	Q/ S
Q/ SQSS./ S	Q/ SQ/ SQSSS./ S	Q/ S
Q/ SQSS./ S	Q/ S
Q/ SQSS.S.rSSS.SSS.SSS.SSS.SSS.SSS.SSS.SSS.S.rS r    S$S\4S jjrS rS rS rS  r S! r!S" r"S# r#g)%z;AWQ (Activation aware Weight Quantization) integration file    N)version   )ACT2FN)PreTrainedModel)is_auto_awq_availableis_ipex_availableis_torch_availablelogging)AwqBackendPackingMethod	AwqConfigAWQLinearVersionExllamaVersion)q_projk_projv_projo_proj)	gate_projup_proj	down_proj)input_layernormpost_attention_layernormnormF)	attentionmlp	layernorm	use_alibi)w1w3w2g    .A)r   r   r   r   
rope_theta)mistralmixtralllamallavaactc_fc)r%   layer_before_actdense_h_to_4hr   fc_in	gelu_impl)
starcoder2RefinedWebModelfalconmptgptjgpt_neoxgpt_bigcodebloomc                 f   SSK Jn  U[        ;  a  U $ U R                  5        H  u  p4[        U   S   n[        U   S   nX5:X  a]  [	        X5      (       aM  [        U [        U   S   5      nUR                  n[        R                  " U5      n	U" XI5      U R                  U'   [        XA5      n
M     U $ )Nr   )ScaledActivationr%   r'   )awq.modules.actr4   AWQ_SCALES_MAPPINGSnamed_childrenhasattrgetattrout_featurestorchones_modulesreplace_quantization_scales)model
model_typer4   namemoduleact_namelayer_before_act_namer'   size
scale_like_s              U/var/www/auris/envauris/lib/python3.13/site-packages/transformers/integrations/awq.pyr>   r>   M   s    0,,,,.&z259 3J ?@R S E E&u.A*.MN`.ab#00DD)J#3F#GENN4 '; / L    returnc           
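

# Illustrative sketch (not part of the upstream module): `replace_quantization_scales` is
# typically invoked by the AWQ quantizer after the checkpoint is loaded, with the model type
# taken from the config, e.g.:
#
#     model = replace_quantization_scales(model, model.config.model_type)
#
# For architectures listed in `AWQ_SCALES_MAPPINGS`, this wraps the activation that follows the
# mapped linear layer in a `ScaledActivation` so the stored activation scales can be applied.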


def replace_with_awq_linear(
    model,
    modules_to_not_convert=None,
    quantization_config=None,
    current_key_name=None,
    has_been_replaced=False,
) -> bool:
    """
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    """
    if modules_to_not_convert is None:
        modules_to_not_convert = []

    backend = quantization_config.backend

    if not is_auto_awq_available():
        raise ValueError(
            "AWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or check out the installation guide in https://github.com/mit-han-lab/llm-awq"
        )

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        if quantization_config.version == AWQLinearVersion.GEMM:
            from awq.modules.linear.gemm import WQLinear_GEMM

            target_cls = WQLinear_GEMM
        elif quantization_config.version == AWQLinearVersion.GEMV:
            from awq.modules.linear.gemv import WQLinear_GEMV

            target_cls = WQLinear_GEMV
        elif quantization_config.version == AWQLinearVersion.EXLLAMA:
            if quantization_config.exllama_config["version"] == ExllamaVersion.ONE:
                from awq.modules.linear.exllama import WQLinear_Exllama

                target_cls = WQLinear_Exllama
            elif quantization_config.exllama_config["version"] == ExllamaVersion.TWO:
                from awq.modules.linear.exllamav2 import WQLinear_ExllamaV2

                target_cls = WQLinear_ExllamaV2
            else:
                raise ValueError(f"Unrecognized Exllama version: {quantization_config.exllama_config['version']}")
        elif quantization_config.version == AWQLinearVersion.IPEX:
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            target_cls = WQLinear_IPEX
        else:
            raise ValueError(f"Unrecognized AWQ version: {quantization_config.version}")
    else:
        from awq.quantize.qmodule import WQLinear

        target_cls = WQLinear

    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
            # Check that the current key is not in `modules_to_not_convert`
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                in_features = module.in_features
                out_features = module.out_features

                model._modules[name] = target_cls(
                    w_bit=quantization_config.bits,
                    group_size=quantization_config.group_size,
                    in_features=in_features,
                    out_features=out_features,
                    bias=module.bias is not None,
                    dev=module.weight.device,
                )
                has_been_replaced = True

                # Force requires_grad to False to avoid unexpected errors
                model._modules[name].requires_grad_(False)
        if len(list(module.children())) > 0:
            _, has_been_replaced = replace_with_awq_linear(
                module,
                modules_to_not_convert=modules_to_not_convert,
                current_key_name=current_key_name,
                quantization_config=quantization_config,
                has_been_replaced=has_been_replaced,
            )
        # Remove the last key for the next iteration of the recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def get_modules_to_fuse(model, quantization_config):
    """
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    """
    if not isinstance(model, PreTrainedModel):
        raise TypeError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")

    if quantization_config.modules_to_fuse is not None:
        current_fused_mapping = quantization_config.modules_to_fuse
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]

        config = model.config.get_text_config(decoder=True)

        hidden_size = config.hidden_size
        num_attention_heads = config.num_attention_heads
        num_key_value_heads = getattr(config, "num_key_value_heads", num_attention_heads)

        current_fused_mapping["hidden_size"] = hidden_size
        current_fused_mapping["num_attention_heads"] = num_attention_heads
        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
    else:
        raise ValueError(
            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers https://github.com/huggingface/transformers to add its support."
        )
    return current_fused_mapping


def fuse_awq_modules(model, quantization_config):
    """
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    """
    # Convert a dict config into an `AwqConfig` object so that fields such as `backend` are available.
    if isinstance(quantization_config, dict):
        quantization_config = AwqConfig.from_dict(quantization_config)
    backend = quantization_config.backend

    modules_to_fuse = get_modules_to_fuse(model, quantization_config)
    modules_to_not_convert = getattr(quantization_config, "modules_to_not_convert", None)

    if backend == AwqBackendPackingMethod.AUTOAWQ:
        from awq.modules.fused.attn import QuantAttentionFused
        from awq.modules.fused.mlp import QuantFusedMLP
        from awq.modules.fused.norm import FasterTransformerRMSNorm
    else:
        raise ValueError("Fusing is only supported for the AutoAWQ backend")

    fused_attention_modules = []

    for name, module in model.named_modules():
        if modules_to_not_convert is not None:
            if any(module_name_to_not_convert in name for module_name_to_not_convert in modules_to_not_convert):
                continue

        # Replace the layernorm layers
        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)

        # Replace the MLP layers (not supported by the IPEX kernels)
        if quantization_config.version != "ipex":
            _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
        else:
            logger.info("The IPEX version AWQ does not support fuse mlp for now.")

        # Replace the attention layers
        attention_has_been_fused = _fuse_awq_attention_layers(
            model, module, modules_to_fuse, name, QuantAttentionFused
        )

        if attention_has_been_fused:
            fused_attention_modules.append(name.split(".")[0])

    # For fused attention modules the attention mask is handled inside the fused module, so the config
    # is switched to the "custom" attention implementation to avoid passing an unexpected mask to them.
    if len(fused_attention_modules) > 0:
        for module_name, module in model.named_modules():
            if any(
                module_name in fused_attention_modules for fused_attention_parent_module in fused_attention_modules
            ):
                if hasattr(module, "config") and hasattr(module.config, "_attn_implementation"):
                    module.config._attn_implementation = "custom"
    return model


def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
    """
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    """
    for module_name in fuse_module_names:
        if hasattr(module, module_name):
            old_module = getattr(module, module_name)
            module._modules[module_name] = target_cls(
                old_module.weight,
                old_module.variance_epsilon,
            ).to(old_module.weight.device)
            del old_module


def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
    """
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    """
    if len(fuse_module_names) == 0:
        return

    if hasattr(module, fuse_module_names[0]):
        gate_proj = getattr(module, fuse_module_names[0])
        up_proj = getattr(module, fuse_module_names[1])
        down_proj = getattr(module, fuse_module_names[2])

        previous_device = gate_proj.qweight.device

        config = model.config.get_text_config(decoder=True)
        hidden_act = config.hidden_act
        activation_fn = ACT2FN[hidden_act]
        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, new_module.to(previous_device))

        del gate_proj, up_proj, down_proj


def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
    """
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`List[str]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    """
    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV

    module_has_been_fused = False

    if len(modules_to_fuse["attention"]) == 0:
        return module_has_been_fused

    if hasattr(module, modules_to_fuse["attention"][0]):
        # First, pack the q, k, v projections into a single qkv layer
        q_proj = getattr(module, modules_to_fuse["attention"][0])

        if isinstance(q_proj, WQLinear_GEMV):
            linear_target_cls = WQLinear_GEMV
            cat_dim = 0
        elif isinstance(q_proj, WQLinear_GEMM):
            linear_target_cls = WQLinear_GEMM
            cat_dim = 1
        elif is_ipex_available() and version.parse(importlib.metadata.version("autoawq")) > version.parse("0.2.6"):
            from awq.modules.linear.gemm_ipex import WQLinear_IPEX

            if isinstance(q_proj, WQLinear_IPEX):
                linear_target_cls = WQLinear_IPEX
                cat_dim = 1
        else:
            raise ValueError(f"Unsupported q_proj type: {type(q_proj)}")

        previous_device = q_proj.qweight.device

        k_proj = getattr(module, modules_to_fuse["attention"][1])
        v_proj = getattr(module, modules_to_fuse["attention"][2])
        o_proj = getattr(module, modules_to_fuse["attention"][3])

        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None

        qkv_layer = linear_target_cls(
            q_proj.w_bit,
            q_proj.group_size,
            q_proj.in_features,
            q_proj.out_features + k_proj.out_features + v_proj.out_features,
            q_proj.bias is not None,
            next(iter(module.state_dict().values())).device,
        )

        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)

        if isinstance(qkv_layer, WQLinear_GEMV):
            qkv_layer.split_k_iters = q_proj.split_k_iters

        qkv_layer.bias = bias

        fused_attention_layer = target_cls(
            modules_to_fuse["hidden_size"],
            modules_to_fuse["num_attention_heads"],
            modules_to_fuse["num_key_value_heads"],
            qkv_layer,
            o_proj,
            previous_device,
            modules_to_fuse["max_seq_len"],
            use_alibi=modules_to_fuse["use_alibi"],
            # 10000.0 is the default rope_theta used by autoawq
            rope_theta=modules_to_fuse.get("rope_theta", 10000.0),
        )

        fused_attention_layer.is_hf_transformers = True

        parent_name, child_name = current_module_name.rsplit(".", 1)
        parent = model.get_submodule(parent_name)
        setattr(parent, child_name, fused_attention_layer.to(previous_device))

        del q_proj, k_proj, v_proj, o_proj
        module_has_been_fused = True

    return module_has_been_fused


def post_init_awq_exllama_modules(model, exllama_config):
    """
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    """
    if exllama_config["version"] == ExllamaVersion.ONE:
        from awq.modules.linear.exllama import exllama_post_init

        model = exllama_post_init(model)
    elif exllama_config["version"] == ExllamaVersion.TWO:
        from awq.modules.linear.exllamav2 import exllamav2_post_init

        model = exllamav2_post_init(
            model,
            max_input_len=exllama_config["max_input_len"],
            max_batch_size=exllama_config["max_batch_size"],
        )
    else:
        raise ValueError(f"Unrecognized Exllama version: {exllama_config['version']}")

    return model
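

# Illustrative sketch (not part of the upstream module): called once after loading when the Exllama
# kernels are selected, with the `exllama_config` dict coming from the `AwqConfig`, e.g.:
#
#     exllama_config = {"version": ExllamaVersion.TWO, "max_input_len": 2048, "max_batch_size": 8}
#     model = post_init_awq_exllama_modules(model, exllama_config)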


def post_init_awq_ipex_modules(model):
    """
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    """
    from awq.modules.linear.gemm_ipex import ipex_post_init

    model = ipex_post_init(model)

    return model