from typing import TYPE_CHECKING, Any, Dict, List, Optional

from .base import HfQuantizer


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)


class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False

    required_packages = ["fbgemm-gpu", "accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)"
            )
        if not is_fbgemm_gpu_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires the fbgemm-gpu library. "
                "Please install the latest version of fbgemm-gpu by following: "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )
        if not is_accelerate_available("0.32.2"):
            raise ImportError(
                "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)"
            )
        if not torch.cuda.is_available():
            raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU")

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability
        if major < 9:
            raise ValueError(
                "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)"
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to "
                "requirements of `fbgemm-gpu` to enable model loading in fp8. "
                "Pass your own torch_dtype to specify the dtype of the remaining non-linear layers, or pass "
                "torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        elif torch_dtype == torch.float16:
            raise ValueError(
                "You cannot use FP8 with torch_dtype=torch.float16. "
                "We recommend passing torch_dtype=torch.bfloat16 instead."
            )
        return torch_dtype

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        elif isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            else:
                if tensor_name == "gate_up_proj_scale" or tensor_name == "down_proj_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights into weight and weight_scale
        """
        from ..integrations import FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if tensor_name == "gate_up_proj":
                # Swap the expert and feature dimensions, then flatten to 2D so the
                # fbgemm kernel can quantize per row
                transposed_param = param_value.transpose(1, 2)
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
                # Restore the original 3D layout
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], 1, original_shape[1])
            elif tensor_name == "down_proj":
                # Same flatten-quantize-restore steps for the down projection, with the
                # scales laid out along the last dimension instead
                transposed_param = param_value.transpose(1, 2)
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], original_shape[1], 1)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(weight_scale.to(target_device))
        else:
            new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(
                weight_scale.view(weight_scale.shape[0], 1).to(target_device)
            )
        module._parameters[tensor_name] = torch.nn.Parameter(new_value.to(target_device))

        if unexpected_keys is not None and param_name in unexpected_keys:
            unexpected_keys.remove(param_name)
        del param_value
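    # A shape-contract sketch for the fbgemm kernel used above (added for clarity,
    # not part of the original file; the exact scale dtype may vary with the
    # fbgemm-gpu version): `torch.ops.fbgemm.quantize_fp8_per_row` takes a 2D
    # tensor and returns the fp8 weights plus one scale per row, e.g.
    #
    #     w = torch.randn(out_features, in_features, dtype=torch.bfloat16, device="cuda")
    #     w_fp8, w_scale = torch.ops.fbgemm.quantize_fp8_per_row(w)
    #     assert w_fp8.dtype == torch.float8_e4m3fn and w_fp8.shape == w.shape
    #     assert w_scale.shape == (out_features,)  # hence the `.view(shape[0], 1)` above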
    def _process_model_after_weight_loading(self, model, **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[List[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        tp_plan = model._tp_plan
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        config = model.config
        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            config=config,
            tp_plan=tp_plan,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts": "local",
                "layers.*.feed_forward": "gather",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "local_rowwise",
            }
            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False
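
# A minimal usage sketch (added for illustration, not part of the original module).
# This quantizer is never instantiated by hand: passing a `FbgemmFp8Config` to
# `from_pretrained` selects it through the auto-quantizer mapping. The checkpoint
# name below is hypothetical, and the call assumes fbgemm-gpu is installed and a
# compute-capability >= 9.0 GPU (e.g. H100) is available.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, FbgemmFp8Config

    quantization_config = FbgemmFp8Config()
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Meta-Llama-3-8B",  # hypothetical checkpoint, for illustration only
        torch_dtype="bfloat16",  # non-quantized layers must stay in bf16 (see update_torch_dtype)
        device_map="cuda",
        quantization_config=quantization_config,
    )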