from typing import TYPE_CHECKING, Any, Dict, List, Optional

from .base import HfQuantizer
from .quantizers_utils import get_module_from_name


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging


if is_torch_available():
    import torch

logger = logging.get_logger(__name__)
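
# How FP8 storage works in this quantizer, sketched in one illustrative line: weights
# are kept in torch.float8_e4m3fn (4 exponent bits, 3 mantissa bits, max finite value
# 448), so each row of a quantized weight carries a higher-precision scale, and
# dequantization is roughly
#
#     w_bf16 = w_fp8.to(torch.bfloat16) * weight_scale  # scale broadcast over each row
#
# The scale tensors are the `weight_scale` / `*_proj_scale` parameters handled below.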
class FbgemmFp8HfQuantizer(HfQuantizer):
    """
    FP8 quantization using fbgemm kernels
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["fbgemm-gpu", "accelerate"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def validate_environment(self, *args, **kwargs):
        if not is_torch_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires torch >= 2.1.0. "
                "Please install the latest version of torch (`pip install --upgrade torch`)."
            )
        if not is_fbgemm_gpu_available():
            raise ImportError(
                "Using fbgemm fp8 quantization requires the fbgemm-gpu library. "
                "Please install the latest version by following the instructions at "
                "https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries"
            )
        if not is_accelerate_available("0.32.2"):
            raise ImportError(
                "Loading an FP8 quantized model requires accelerate > 0.32.1 (`pip install --upgrade accelerate`)."
            )
        if not torch.cuda.is_available():
            raise RuntimeError("Using FP8 quantized models with fbgemm kernels requires a GPU.")

        compute_capability = torch.cuda.get_device_capability()
        major, minor = compute_capability
        if major < 9:
            raise ValueError(
                "FP8 quantized models are only supported on GPUs with compute capability >= 9.0 (e.g. H100)."
            )

        device_map = kwargs.get("device_map", None)
        if device_map is None:
            logger.warning_once(
                "You have loaded an FP8 model on CPU and have a CUDA device available, make sure to set "
                "your model on a GPU device in order to run your model. To remove this warning, pass device_map = 'cuda'."
            )
        else:
            if (
                not self.pre_quantized
                and isinstance(device_map, dict)
                and ("cpu" in device_map.values() or "disk" in device_map.values())
            ):
                raise ValueError(
                    "You are attempting to load an FP8 model with a device_map that contains a CPU or disk device. "
                    "This is not supported when the model is quantized on the fly. "
                    "Please use a quantized checkpoint or remove the CPU or disk device from the device_map."
                )

    def update_torch_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if torch_dtype is None:
            torch_dtype = torch.bfloat16
            logger.info(
                "Overriding torch_dtype=%s with `torch_dtype=torch.bfloat16` due to requirements of `fbgemm-gpu` "
                "to enable model loading in fp8. Pass your own torch_dtype to specify the dtype of the remaining "
                "non-linear layers, or pass torch_dtype=torch.bfloat16 to remove this warning.",
                torch_dtype,
            )
        elif torch_dtype == torch.float16:
            raise ValueError(
                "You cannot use FP8 with torch_dtype=torch.float16. "
                "We recommend passing torch_dtype=torch.bfloat16 instead."
            )
        return torch_dtype
    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Linear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.float8_e4m3fn:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if self.pre_quantized or tensor_name == "bias":
                return False
            else:
                if tensor_name == "gate_up_proj_scale" or tensor_name == "down_proj_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights into weight and weight_scale
        """
        from ..integrations import FbgemmFp8Llama4TextExperts

        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, FbgemmFp8Llama4TextExperts):
            if tensor_name == "gate_up_proj":
                # Transpose the last two dimensions so rows correspond to output channels
                transposed_param = param_value.transpose(1, 2)

                # Reshape to 2D for per-row quantization
                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])

                # Quantize per row instead of per column
                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)

                # Reshape back to the original expert layout
                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], 1, original_shape[1])
            elif tensor_name == "down_proj":
                # Same per-row scheme for the down projection
                transposed_param = param_value.transpose(1, 2)

                original_shape = transposed_param.shape
                flattened_param = transposed_param.reshape(-1, original_shape[-1])

                new_value_flat, weight_scale_flat = torch.ops.fbgemm.quantize_fp8_per_row(flattened_param)

                new_value = new_value_flat.reshape(original_shape)
                new_value = new_value.transpose(1, 2)
                weight_scale = weight_scale_flat.reshape(original_shape[0], original_shape[1], 1)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(weight_scale.to(target_device))
        else:
            new_value, weight_scale = torch.ops.fbgemm.quantize_fp8_per_row(param_value)
            module._parameters[f"{tensor_name}_scale"] = torch.nn.Parameter(
                weight_scale.view(weight_scale.shape[0], 1).to(target_device)
            )

        module._parameters[tensor_name] = torch.nn.Parameter(new_value.to(target_device))

        if unexpected_keys is not None and param_name in unexpected_keys:
            unexpected_keys.remove(param_name)

    def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
        return model

    def _process_model_before_weight_loading(
        self,
        model: "PreTrainedModel",
        keep_in_fp32_modules: Optional[List[str]] = None,
        **kwargs,
    ):
        from ..integrations import replace_with_fbgemm_fp8_linear

        tp_plan = model._tp_plan
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )

        config = model.config
        model = replace_with_fbgemm_fp8_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
            config=config,
            tp_plan=tp_plan,
        )

        model.config.quantization_config = self.quantization_config

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        from ..integrations import FbgemmFp8Linear, FbgemmFp8Llama4TextExperts

        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, (FbgemmFp8Linear, FbgemmFp8Llama4TextExperts)):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def update_tp_plan(self, config):
        if "Llama4" in config.__class__.__name__:
            text_plan = {
                "layers.*.self_attn.q_proj.weight": "local_colwise",
                "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.k_proj.weight": "local_colwise",
                "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.v_proj.weight": "local_colwise",
                "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
                "layers.*.self_attn.o_proj.weight": "local_rowwise",
                "layers.*.self_attn": "gather",
                "layers.*.input_layernorm.weight": "sequence_parallel",
                "layers.*.post_attention_layernorm.weight": "sequence_parallel",
                "norm.weight": "sequence_parallel",
                "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts": "local",
                "layers.*.feed_forward": "gather",
                "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
                "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
                "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
                "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
                "layers.*.feed_forward.experts.down_proj": "local_rowwise",
            }
            if config.get_text_config() is not None:
                config.get_text_config().base_model_tp_plan = text_plan
            else:
                config.base_model_tp_plan = text_plan
        return config

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False