from typing import TYPE_CHECKING, Any, Dict, List, Optional

from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if is_torch_available():
    import torch

if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

logger = logging.get_logger(__name__)
  e Zd ZdZdZdZdgZ fddZdd Zd-ddZ		d.ddddde
dddee
ef deee
  fddZddddde
dee
ef fddZ	d.dddeee
  fddZd/dd Zd!ee
 d"e
dee
 fd#d$Zd%d& Zd.d'd(Zedefd)d*Zd+d, Z  ZS )0FineGrainedFP8HfQuantizerz
    FP8 quantization implementation supporting both standard and MoE models.
    Supports both e4m3fn formats based on platform.
    TFZ
acceleratec                    s   t  j|fi | || _d S N)super__init__quantization_config)selfr   kwargs	__class__ `/var/www/auris/lib/python3.10/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr      s   

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (pip install --upgrade torch)"
            )

        if not is_accelerate_available():
            raise ImportError("Loading an FP8 quantized model requires accelerate (`pip install accelerate`)")

        if kwargs.get("from_tf", False) or kwargs.get("from_flax", False):
            raise ValueError(
                "Converting into FP8 weights from tf/flax weights is currently not supported, "
                "please make sure the weights are in PyTorch format."
            )

        if not torch.cuda.is_available() and not is_torch_xpu_available():
            raise RuntimeError("No GPU or XPU found. A GPU or XPU is needed for FP8 quantization.")

        if torch.cuda.is_available():
            compute_capability = torch.cuda.get_device_capability()
            major, minor = compute_capability
            if (major < 8) or (major == 8 and minor < 9):
                raise ValueError(
                    "FP8 quantized models are only supported on GPUs with compute capability >= 8.9 "
                    f"(e.g. 4090/H100), actual = `{major}.{minor}`"
                )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        elif device_map is not None:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a cpu/disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the cpu/disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            logger.info("Setting torch_dtype to torch.float32 as no torch_dtype was specified in from_pretrained")
            torch_dtype = torch.float32
        return torch_dtype

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights to FP8 format using block-wise quantization.
        """
        from ..modeling_utils import _load_parameter_into_model

        param_value = param_value.to(target_device)

        # FP8 (e4m3fn) representable range
        fp8_min = torch.finfo(torch.float8_e4m3fn).min
        fp8_max = torch.finfo(torch.float8_e4m3fn).max

        block_size_m, block_size_n = self.quantization_config.weight_block_size

        rows, cols = param_value.shape[-2:]
        if rows % block_size_m != 0 or cols % block_size_n != 0:
            raise ValueError(
                f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})"
            )
        param_value_orig_shape = param_value.shape

        # View the weight as a grid of (block_size_m, block_size_n) blocks
        param_value = param_value.reshape(
            -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n
        ).permute(0, 1, 3, 2, 4)

        # Calculate the scaling factor for each block
        max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2))
        scale = fp8_max / max_abs
        scale_orig_shape = scale.shape
        scale = scale.unsqueeze(-1).unsqueeze(-1)

        # Quantize the weights
        quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

        # Restore the original weight layout
        quantized_param = quantized_param.permute(0, 1, 3, 2, 4)
        quantized_param = quantized_param.reshape(param_value_orig_shape)

        # Store the inverse of each block scale next to the quantized weight
        scale = scale.reshape(scale_orig_shape).squeeze().reciprocal()

        _load_parameter_into_model(model, param_name, quantized_param)
        _load_parameter_into_model(model, param_name.rsplit(".", 1)[0] + ".weight_scale_inv", scale)
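
    # Illustrative sketch of the block-wise scheme above (example values assumed, not taken from a
    # real checkpoint): with weight_block_size = (128, 128), a (256, 512) weight is viewed as a
    # 2 x 4 grid of 128 x 128 blocks. Each block is scaled by fp8_max / amax(|block|) (fp8_max is
    # 448 for float8_e4m3fn), cast to float8_e4m3fn, and the reciprocal of every block scale is
    # stored in a (2, 4) `weight_scale_inv` tensor so kernels can dequantize as
    # w_fp8 * weight_scale_inv.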

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        from ..integrations.finegrained_fp8 import FP8Linear

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FP8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale_inv":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[List[str]] = None,
        **kwargs,
    ):
        from ..integrations.finegrained_fp8 import replace_with_fp8_linear

        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        model = replace_with_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
        )

        model.config.quantization_config = self.quantization_config

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        from ..integrations import FP8Linear

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, FP8Linear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Qwen3" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale_inv": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn.o_proj.weight_scale_inv": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.mlp.gate_proj.weight": "local_colwise",
                "layers.*.mlp.gate_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.up_proj.weight": "local_colwise",
                "layers.*.mlp.up_proj.weight_scale_inv": "local_colwise",
                "layers.*.mlp.down_proj.weight": "local_rowwise",
                "layers.*.mlp.down_proj.weight_scale_inv": "local_rowwise",
                "layers.*.mlp": "gather",
            }
            config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False

    def get_cuda_warm_up_factor(self):
        return 2
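

# A minimal usage sketch (illustrative only; it assumes the public `transformers` API and the
# `FineGrainedFP8Config` quantization config — this quantizer is selected automatically by
# `from_pretrained` and is not meant to be instantiated directly):
#
#     from transformers import AutoModelForCausalLM, FineGrainedFP8Config
#
#     quantization_config = FineGrainedFP8Config(weight_block_size=(128, 128))
#     model = AutoModelForCausalLM.from_pretrained(
#         "org/some-model",  # hypothetical checkpoint id
#         quantization_config=quantization_config,
#         device_map="cuda",
#     )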