
    fTh%                         S SK JrJrJrJrJr  SSKJrJrJ	r	J
r
  SSKJr  SSKJr  \" 5       (       a  S SKr\(       a  SSKJr  \
R$                  " \5      r " S	 S
\5      rg)    )TYPE_CHECKINGAnyDictListOptional   )is_accelerate_availableis_torch_availableis_torch_xpu_availablelogging   )HfQuantizer)get_module_from_nameN)PreTrainedModelc                   *  ^  \ rS rSrSrSrSrS/rU 4S jrS r	S S	 jr
 S!S
SSSS\SSS\\\4   S\\\      4S jjrS
SSSS\S\\\4   4S jr S!S
SS\\\      4S jjrS"S jrS\\   S\S\\   4S jrS rS!S jr\S\4S j5       rS rSrU =r$ )#FineGrainedFP8HfQuantizer   zz
FP8 quantization implementation supporting both standard and MoE models.
Supports both e4m3fn formats based on platform.
TF
acceleratec                 4   > [         TU ]  " U40 UD6  Xl        g N)super__init__quantization_config)selfr   kwargs	__class__s      i/var/www/auris/envauris/lib/python3.13/site-packages/transformers/quantizers/quantizer_finegrained_fp8.pyr   "FineGrainedFP8HfQuantizer.__init__   s    ,77#6     c                 >   [        5       (       d  [        S5      e[        5       (       d  [        S5      eUR                  SS5      (       d  UR                  SS5      (       a  [	        S5      e[
        R                  R                  5       (       d  [        5       (       d  [        S5      e[
        R                  R                  5       (       aF  [
        R                  R                  5       nUu  pEUS:  d  US:X  a  US	:  a  [	        S
U SU S35      eUR                  SS 5      nUc  [        R                  S5        g Ub\  U R                  (       dJ  [        U[        5      (       a4  SUR!                  5       ;   d  SUR!                  5       ;   a  [	        S5      eg g g g )NzxUsing fp8 quantization requires torch >= 2.1.0Please install the latest version of torch ( pip install --upgrade torch )zMLoading an FP8 quantized model requires accelerate (`pip install accelerate`)from_tfF	from_flaxzConverting into FP8 weights from tf/flax weights is currently not supported, please make sure the weights are in PyTorch format.zANo GPU or XPU found. A GPU or XPU is needed for FP8 quantization.   	   ziFP8 quantized models is only supported on GPUs with compute capability >= 8.9 (e.g 4090/H100), actual = `.`
device_mapzYou have loaded an FP8 model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'. cpudiskzYou are attempting to load an FP8 model with a device_map that contains a cpu/disk device.This is not supported when the model is quantized on the fly. Please use a quantized checkpoint or remove the cpu/disk device from the device_map.)r
   ImportErrorr	   get
ValueErrortorchcudais_availabler   RuntimeErrorget_device_capabilityloggerwarning_oncepre_quantized
isinstancedictvalues)r   argsr   compute_capabilitymajorminorr'   s          r   validate_environment.FineGrainedFP8HfQuantizer.validate_environment   s   !##] 
 '((mnn::i''6::k5+I+IF 
 

''))-C-E-Ebcc::""$$!&!A!A!C-LE	uzeai ##('5'4 
 ZZd3
| #&&z400j//11Vz?P?P?R5R k  6S 1 ' $r   returnc                 V    Uc%  [         R                  S5        [        R                  nU$ )NzWSetting torch_dtype to torch.float32 as no torch_dtype was specified in from_pretrained)r2   infor-   float32)r   torch_dtypes     r   update_torch_dtype,FineGrainedFP8HfQuantizer.update_torch_dtypeM   s$    KKqr--Kr   modelr   param_valueztorch.Tensor
param_nametarget_deviceztorch.device
state_dictunexpected_keysc                    SSK Jn  UR                  U5      n[        R                  " [        R
                  5      R                  n[        R                  " [        R
                  5      R                  n	U R                  R                  u  pUR                  SS u  pX-  S:w  d  X-  S:w  a  [        SU SU SU
 SU S	3	5      eUR                  nUR                  S
X-  XU-  U5      R                  SSSSS5      n[        R                  " [        R                  " U5      SS9nX-  nUR                  nUR!                  S
5      R!                  S
5      n[        R"                  " UU-  XS9R                  [        R
                  5      nUR                  SSSSS5      nUR                  U5      nUR                  U5      R%                  5       R'                  5       nU" XU5        U" XR)                  SS5      S   S-   U5        g)z?
Quantizes weights to FP8 format using Block-wise quantization
r   )_load_parameter_into_modelNr   zMatrix dimensions (z, z$) must be divisible by block sizes ()r         )rO   rM   )dim)minmaxr%   z.weight_scale_inv)modeling_utilsrL   tor-   finfofloat8_e4m3fnrS   rT   r   weight_block_sizeshaper,   reshapepermuteamaxabs	unsqueezeclampsqueeze
reciprocalrsplit)r   rE   rF   rG   rH   rI   rJ   rL   fp8_minfp8_maxblock_size_mblock_size_nrowscolsparam_value_orig_shapemax_absscalescale_orig_shapequantized_params                      r   create_quantized_param0FineGrainedFP8HfQuantizer.create_quantized_paramS   s    	@!nn]3 ++e11266++e11266%)%=%=%O%O" &&rs+
!#t':a'?%dV2dV3WXdWeeghtguuvw  "-!2!2!))$lL4H,

'!Q1a
  	
 **UYY{3B! ;;#--b1  ++kE&9wTWWX]XkXkl)11!Q1a@)112HI ./779DDF 	#5oF"5*;*;C*CA*FI\*\^cdr   c                     SSK Jn  [        X5      u  px[        Xv5      (       aY  U R                  (       d  US:X  a0  US:X  a)  UR
                  [        R                  :w  a  [        S5      egUS:X  a  [        S5      eg	g)
Nr   	FP8Linearbiasweightz6Expect quantized weights but got an unquantized weightFweight_scale_invz;Expect unquantized weights but got a quantized weight_scaleT)	integrations.finegrained_fp8rs   r   r5   r4   dtyper-   rX   r,   )	r   rE   rF   rG   rI   r   rs   moduletensor_names	            r   check_quantized_param/FineGrainedFP8HfQuantizer.check_quantized_param   st     	=25Ef((!![F%:(*{/@/@EDWDW/W$%]^^"44$%bccr   keep_in_fp32_modulesc                     SSK Jn  U R                  XR                  R                  U5      U l        U" UU R                  U R                  S9nU R                  UR
                  l        g )Nr   )replace_with_fp8_linearmodules_to_not_convertr   )rw   r   get_modules_to_not_convertr   r   config)r   rE   r}   r   r   s        r   $_process_model_before_weight_loading>FineGrainedFP8HfQuantizer._process_model_before_weight_loading   sb     	K&*&E&E++BBDX'
# (#'#>#> $ 8 8
 ,0+C+C(r   c                     U$ r    )r   rE   r   s      r   #_process_model_after_weight_loading=FineGrainedFP8HfQuantizer._process_model_after_weight_loading   s    r   missing_keysprefixc                 \   SSK Jn  / nUR                  5        Hr  u  pg[        Xt5      (       d  M  U HU  nXh;   d  Xc SU 3;   d  M  UR	                  S5      (       a  M,  UR	                  S5      (       a  MD  UR                  U5        MW     Mt     U V	s/ s H  oU;  d  M
  U	PM     sn	$ s  sn	f )Nr   rr   r%   z.weightz.bias)integrationsrs   named_modulesr5   endswithappend)
r   rE   r   r   rs   not_missing_keysnamery   missingks
             r   update_missing_keys-FineGrainedFP8HfQuantizer.update_missing_keys   s    ,!//1LD&,,+GDhay4I,I ' 0 0 ; ; ' 0 0 9 9(//8  , 2 (E<a4D+D<EEEs   	B) B)c                     SUR                   R                  ;   a8  0 SS_SS_SS_SS_SS_SS_S	S
_SS
_SS_SS_SS_SS_SS_SS
_SS
_SS_nX!l        U$ )NQwen3z layers.*.self_attn.q_proj.weightlocal_colwisez*layers.*.self_attn.q_proj.weight_scale_invz layers.*.self_attn.k_proj.weightz*layers.*.self_attn.k_proj.weight_scale_invz layers.*.self_attn.v_proj.weightz*layers.*.self_attn.v_proj.weight_scale_invz layers.*.self_attn.o_proj.weightlocal_rowwisez*layers.*.self_attn.o_proj.weight_scale_invzlayers.*.self_attngatherzlayers.*.mlp.gate_proj.weightz'layers.*.mlp.gate_proj.weight_scale_invzlayers.*.mlp.up_proj.weightz%layers.*.mlp.up_proj.weight_scale_invzlayers.*.mlp.down_proj.weightz'layers.*.mlp.down_proj.weight_scale_invzlayers.*.mlp)r   __name__base_model_tp_plan)r   r   	text_plans      r   update_tp_plan(FineGrainedFP8HfQuantizer.update_tp_plan   s    f&&///2O<o 3O =o	
 3O =o 3O =o %h 0 :? . 8 0 :?  !I& )2%r   c                     g)NTr   )r   safe_serializations     r   is_serializable)FineGrainedFP8HfQuantizer.is_serializable   s    r   c                     g)NFr   r   s    r   is_trainable&FineGrainedFP8HfQuantizer.is_trainable   s    r   c                     g)Nr   r   r   s    r   get_cuda_warm_up_factor1FineGrainedFP8HfQuantizer.get_cuda_warm_up_factor   s    r   r   )rB   torch.dtyper>   r   r   )rE   r   )r   
__module____qualname____firstlineno____doc__ requires_parameters_quantizationrequires_calibrationrequired_packagesr   r<   rC   strr   r   r   r   ro   r{   r   r   r   r   r   propertyboolr   r   __static_attributes____classcell__)r   s   @r   r   r      sC   
 (,$ %7,\ 044e 4e $4e 	4e
 &4e cN4e "$s),4el  $ 	
 cN4 59D D 'tCy1D(FtCy F# FRVWZR[ F2 d   r   r   )typingr   r   r   r   r   utilsr	   r
   r   r   baser   quantizers_utilsr   r-   rU   r   
get_loggerr   r2   r   r   r   r   <module>r      sI    ; ; ` `  2 0			H	%W Wr   