o
    ZhcP                  
   @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZmZmZmZ e r=ddlZddlmZ eeZg d	g d
g dddg d	g dg ddddg d	g d
g dddg d	g d
g ddddZdddddddddddddddddddddddddZdd Z				d-defddZdd  Zd!d" Zd#d$ Zd%d& Z d'd( Z!d)d* Z"d+d, Z#dS ).z;AWQ (Activation aware Weight Quantization) integration file    N)version   )ACT2FN)PreTrainedModel)is_auto_awq_availableis_ipex_availableis_torch_availablelogging)AwqBackendPackingMethod	AwqConfigAWQLinearVersionExllamaVersion)q_projk_projv_projo_proj)	gate_projup_proj	down_proj)Zinput_layernormZpost_attention_layernormZnormF)	attentionmlp	layernorm	use_alibi)Zw1Zw3Zw2g    .A)r   r   r   r   
rope_theta)ZmistralZmixtralllamaZllavaactZc_fc)r   layer_before_actZdense_h_to_4hr   Zfc_inZ	gelu_impl)Z
starcoder2ZRefinedWebModelZfalconZmptZgptjZgpt_neoxZgpt_bigcodeZbloomc                 C   s   ddl m} |tvr| S |  D ]7\}}t| d }t| d }||krBt| |rBt| t| d }|j}t|}	|||	| j	|< t
||}
q| S )Nr   )ScaledActivationr   r   )Zawq.modules.actr   AWQ_SCALES_MAPPINGSnamed_childrenhasattrgetattrout_featurestorchZones_modulesreplace_quantization_scales)model
model_typer   namemoduleZact_nameZlayer_before_act_namer   sizeZ
scale_like_ r,   L/var/www/auris/lib/python3.10/site-packages/transformers/integrations/awq.pyr%   M   s   
r%   returnc              	      s  |du rg }|j }t std|tjkr||jtjkr$ddlm	} |}n`|jtj
kr3ddlm} |}nQ|jtjkre|jd tjkrJddlm}	 |	}n:|jd tjkr[ddlm}
 |
}n)td	|jd  |jtjkrtdd
lm} |}ntd|j ddlm} |}|  D ]b\}} du rg   | t|tjr||vrt  fdd|D s|j!}|j"}||j#|j$|||j%du|j&j'd| j(|< d}| j(| )d t*t+|, dkrt-|| ||d\}} .d q| |fS )a  
    Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
    `accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
    conversion has been successful or not.

    During the module replacement, we also infer the backend to use through the `quantization_config` object.

    Args:
        model (`torch.nn.Module`):
            The model to convert, can be any `torch.nn.Module` instance.
        quantization_config (`AwqConfig`):
            The quantization config object that contains the quantization parameters.
        modules_to_not_convert (`list`, *optional*):
            A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
            converted.
        current_key_name (`list`, *optional*):
            A list that contains the current key name. This is used for recursion and should not be passed by the user.
        has_been_replaced (`bool`, *optional*):
            A boolean that indicates if the conversion has been successful or not. This is used for recursion and
            should not be passed by the user.
    NzAWQ (either `autoawq` or `llmawq`) is not available. Please install it with `pip install autoawq` or check out the installation guide in https://github.com/mit-han-lab/llm-awqr   )WQLinear_GEMM)WQLinear_GEMVr   )WQLinear_Exllama)WQLinear_ExllamaV2Unrecognized Exllama version: WQLinear_IPEXzUnrecognized AWQ version: )WQLinearc                 3   s    | ]
}|d   v V  qdS ).N)join).0keycurrent_key_namer,   r-   	<genexpr>   s    z*replace_with_awq_linear.<locals>.<genexpr>)w_bit
group_sizein_featuresr"   biasdevTF)modules_to_not_convertr<   quantization_confighas_been_replaced)/backendr   
ValueErrorr
   AUTOAWQr   r   ZGEMMZawq.modules.linear.gemmr/   ZGEMVZawq.modules.linear.gemvr0   ZEXLLAMAexllama_configr   ONEawq.modules.linear.exllamar1   TWOawq.modules.linear.exllamav2r2   ZIPEXawq.modules.linear.gemm_ipexr5   Zawq.quantize.qmoduler6   r   append
isinstancennZLinearanyr@   r"   bitsr?   rA   weightdevicer$   Zrequires_grad_lenlistchildrenreplace_with_awq_linearpop)r&   rC   rD   r<   rE   rG   r/   
target_clsr0   r1   r2   r5   r6   r(   r)   r@   r"   r+   r,   r;   r-   rZ   ^   sp   


rZ   c                 C   s   t | tstd| jj |jdur|j}|j|d< |S | jjt	v rOt	| jj }| jj
dd}|j}|j}t|d|}||d< ||d< ||d< |j|d< |S td	)
af  
    Returns the fusing mapping given the quantization config and the model

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`~transformers.quantization_config.AWQConfig`):
            The quantization configuration to use.
    z:The model should be an instance of `PreTrainedModel`, got Nmax_seq_lenTdecodernum_key_value_headshidden_sizenum_attention_headsa  Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please pass a `fused_mapping` argument in the `quantization_config` or raise an issue on transformers https://github.com/huggingface/transformers to add its support.)rQ   r   	TypeError	__class____name__modules_to_fuseZfuse_max_seq_lenconfigr'   AWQ_FUSED_MAPPINGSget_text_configra   rb   r!   rH   )r&   rD   Zcurrent_fused_mappingrg   ra   rb   r`   r,   r,   r-   get_modules_to_fuse   s(   




rj   c           
         sX  t |tr
t|}|j}t| |}t|dd}|tjkr0ddl	m
} ddlm} ddlm} ntdg  |  D ]E\}|durNtfdd	|D rNq:t|d
 || |jdkrft| |d || ntd t| |||}	|	r dd  q:t dkr|  D ]\}t fdd	 D rt|drt|jdrd|j_q| S )aJ  
    Optionally fuse some modules in the model to speedup inference.

    Args:
        model (`~PreTrainedModel`):
            The model to fuse - note this model should have been converted into AWQ format beforehand.
        quantization_config (`Union[AwqConfig, dict]`):
            The quantization configuration to use.
    rC   Nr   )QuantAttentionFused)QuantFusedMLP)FasterTransformerRMSNormz0Fusing is only supported for the AutoAWQ backendc                 3   s    | ]}| v V  qd S Nr,   )r9   Zmodule_name_to_not_convert)r(   r,   r-   r=     s    z#fuse_awq_modules.<locals>.<genexpr>r   Zipexr   z7The IPEX version AWQ does not support fuse mlp for now.r7   c                 3   s    | ]} v V  qd S rn   r,   )r9   Zfused_attention_parent_module)fused_attention_modulesmodule_namer,   r-   r=   (  s    
rg   _attn_implementationZcustom)rQ   dictr   	from_dictrG   rj   r!   r
   rI   Zawq.modules.fused.attnrk   Zawq.modules.fused.mlprl   Zawq.modules.fused.normrm   rH   Znamed_modulesrS   _fuse_awq_layernormr   _fuse_awq_mlploggerinfo_fuse_awq_attention_layersrP   splitrW   r    rg   rq   )
r&   rD   rG   rf   rC   rk   rl   rm   r)   Zattention_has_been_fusedr,   )ro   rp   r(   r-   fuse_awq_modules   sD   






rz   c                 C   sB   | D ]}t ||rt||}||j|j|jj|j|< ~qdS )a  
    Fuse the LayerNorm layers into a target class using autoawq

    Args:
        fuse_module_names (`List[str]`):
            The list of module names to fuse
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.FasterTransformerRMSNorm`):
            The `FasterTransformerRMSNorm` class as it only supports that class
            for now.
    N)r    r!   rU   Zvariance_epsilontorV   r$   )fuse_module_namesr)   r\   rp   Z
old_moduler,   r,   r-   rt   0  s   


rt   c                 C   s   t |dkrdS t||d rXt||d }t||d }t||d }|jj}| jjdd}	|	j}
t|
 }|||||}|	dd\}}| 
|}t|||| ~~~dS dS )a  
    Fuse the MLP layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        current_module_name (`str`):
            The current submodule name
        fuse_module_names (`List[str]`):
            The list of module names to fuse. For the MLP layers it has to be an array
            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        target_cls (`~autoawq.QuantFusedMLP`):
            The `QuantFusedMLP` class as it only supports that class
            for now.
    r   N   r   Tr^   r7   )rW   r    r!   qweightrV   rg   ri   
hidden_actr   rsplitget_submodulesetattrr{   )r&   current_module_namer|   r)   r\   r   r   r   previous_devicerg   r   Zactivation_fnZ
new_moduleparent_name
child_nameparentr,   r,   r-   ru   G  s    

ru   c                 C   sH  ddl m}m} d}t|d dkr|S t||d d r"t||d d }t||r1|}	d}
n/t||r;|}	d}
n%t r\t	t
jdt	dkr\ddl m} t||r[|}	d}
ntd	|jj}t||d d }t||d d
 }t||d d }|jdurtj|j|j|jgddnd}|	|j|j|j|j|j |j |jdutt|  j}tj|j|j|jg|
d|_tj|j|j|jg|
d|_tj|j|j|jg|
d|_t||r|j|_||_||d |d |d ||||d |d |ddd	}d|_| dd\}}| !|}t"|||#| ~~~~d}|S )a  
    Fuse the Attention layers into a target class using autoawq

    Args:
        model (`~PreTrainedModel`):
            The input pretrained model
        module (`nn.Module`):
            The pytorch parent module that has layernorm modules to fuse
        modules_to_fuse (`List[str]`):
            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
            in the correct order: q, k, v, o layer
        current_module_name (`str`):
            The current submodule name
        target_cls (`~autoawq.QuantAttentionFused`):
            The `QuantAttentionFused` class as it only supports that class
            for now.
    r   )r/   r0   Fr   r}   Zautoawqz0.2.6r4   z'Unsupported q_proj type: {type(q_proj)}r      N)dimra   rb   r`   r]   r   r   g     @)r   r   Tr7   )$Zawq.modules.linearr/   r0   rW   r    r!   rQ   r   r   parse	importlibmetadatar5   rH   r~   rV   rA   r#   catr>   r?   r@   r"   nextiterZ
state_dictvaluesZqzerosscalesZsplit_k_itersgetZis_hf_transformersr   r   r   r{   )r&   r)   rf   r   r\   r/   r0   Zmodule_has_been_fusedr   Zlinear_target_clsZcat_dimr5   r   r   r   r   rA   Z	qkv_layerZfused_attention_layerr   r   r   r,   r,   r-   rx   p  sp   

"
(	


rx   c                 C   sl   |d t jkrddlm} || } | S |d t jkr-ddlm} || |d |d d} | S td|d  )	z
    Runs post init for Exllama layers which performs:
        - Weights unpacking, reordering and repacking
        - Devices scratch space allocation
    r   r   )exllama_post_init)exllamav2_post_initmax_input_lenmax_batch_size)r   r   r3   )r   rK   rL   r   rM   rN   r   rH   )r&   rJ   r   r   r,   r,   r-   post_init_awq_exllama_modules  s   r   c                 C   s   ddl m} || } | S )zl
    Runs post init for IPEX layers which performs:
        - Weights packing, reordering and repacking
    r   )ipex_post_init)rO   r   )r&   r   r,   r,   r-   post_init_awq_ipex_modules  s   r   )NNNF)$__doc__r   	packagingr   Zactivationsr   Zmodeling_utilsr   utilsr   r   r   r	   Zutils.quantization_configr
   r   r   r   r#   Ztorch.nnrR   Z
get_loggerre   rv   rh   r   r%   boolrZ   rj   rz   rt   ru   rx   r   r   r,   r,   r,   r-   <module>   sr   

i)@)_