import importlib
import re
import types
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

from packaging import version

from .base import HfQuantizer
from .quantizers_utils import get_module_from_name
from ..utils import is_torch_available, is_torchao_available, logging
from ..utils.quantization_config import TorchAoConfig


if TYPE_CHECKING:
    from ..modeling_utils import PreTrainedModel

if is_torch_available():
    import torch
    import torch.nn as nn

logger = logging.get_logger(__name__)


def fuzzy_match_size(config_name: str) -> Optional[str]:
    """
    Extract the size digit from strings like "4weight", "8weight".
    Returns the digit as a string if found, otherwise None.
    """
    config_name = config_name.lower()
    str_match = re.search(r"(\d)weight", config_name)
    if str_match:
        return str_match.group(1)
    return None


def find_parent(model, name):
    """Return the parent module of the submodule referenced by the dotted path `name`."""
    module_tree = name.split(".")[:-1]
    parent = model
    for m in module_tree:
        parent = parent._modules[m]
    return parent


def _quantization_type(weight):
    from torchao.dtypes import AffineQuantizedTensor
    from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor

    if isinstance(weight, AffineQuantizedTensor):
        return f"{weight.__class__.__name__}({weight._quantization_type()})"
    if isinstance(weight, LinearActivationQuantizedTensor):
        return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})"


def _linear_extra_repr(self):
    weight = _quantization_type(self.weight)
    if weight is None:
        return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None"
    return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}"


class TorchAoHfQuantizer(HfQuantizer):
    """
    Quantizer for torchao: https://github.com/pytorch/ao/
    """

    requires_parameters_quantization = True
    requires_calibration = False
    required_packages = ["torchao"]

    def __init__(self, quantization_config, **kwargs):
        super().__init__(quantization_config, **kwargs)

    def validate_environment(self, *args, **kwargs):
        if not is_torchao_available():
            raise ImportError("Loading a torchao quantized model requires the torchao library (`pip install torchao`)")
        self.offload = False

        device_map = kwargs.get("device_map", None)
        if isinstance(device_map, dict) and ("cpu" in device_map.values() or "disk" in device_map.values()):
            if self.pre_quantized:
                raise ValueError(
                    "You are attempting to perform cpu/disk offload with a pre-quantized torchao model. "
                    "This is not supported yet. Please remove the CPU or disk device from the device_map."
                )
            self.offload = True

        if self.pre_quantized:
            weights_only = kwargs.get("weights_only", None)
            if weights_only:
                torch_version = version.parse(importlib.metadata.version("torch"))
                if torch_version < version.parse("2.5.0"):
                    raise RuntimeError(
                        f"In order to use a torchao pre-quantized model, you need to have torch>=2.5.0. However, the current version is {torch_version}. "
                        "You can also set `weights_only=False` in `from_pretrained` if you don't want to update torch."
                    )

    def update_torch_dtype(self, torch_dtype):
        if self.quantization_config.quant_type == "int4_weight_only":
            if torch_dtype is not None and torch_dtype != torch.bfloat16:
                logger.warning_once(
                    f"Setting torch_dtype to {torch_dtype} for int4_weight_only quantization, but only bfloat16 is supported right now. Please set the torch_dtype to bfloat16."
                )
            if torch_dtype is None:
                logger.warning_once(
                    "Setting torch_dtype to torch.bfloat16 for int4_weight_only quantization since only bfloat16 is supported right now. Please set torch_dtype=torch.bfloat16 to remove this warning."
                )
                torch_dtype = torch.bfloat16
        if self.quantization_config.quant_type == "int8_dynamic_activation_int8_weight" and torch_dtype is None:
            logger.info(
                "Setting torch_dtype to torch.float32 for int8_dynamic_activation_int8_weight quantization as no torch_dtype was specified in from_pretrained"
            )
            torch_dtype = torch.float32
        return torch_dtype

    def adjust_target_dtype(self, torch_dtype: "torch.dtype") -> "torch.dtype":
        if version.parse(importlib.metadata.version("accelerate")) > version.parse("0.19.0"):
            from accelerate.utils import CustomDtype

            if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
                from torchao.core.config import AOBaseConfig

                quant_type = self.quantization_config.quant_type
                if isinstance(quant_type, AOBaseConfig):
                    # Newer AOBaseConfig-style quant types: infer the bit-width from the config class name.
                    size_digit = fuzzy_match_size(quant_type.__class__.__name__)
                    return CustomDtype.INT4 if size_digit == "4" else torch.int8

            map_to_target_dtype = {
                "int4_weight_only": CustomDtype.INT4,
                "int8_weight_only": torch.int8,
                "int8_dynamic_activation_int8_weight": torch.int8,
                "autoquant": None,
            }
            return map_to_target_dtype[self.quantization_config.quant_type]
        raise ValueError(
            "You are using `device_map='auto'` on a torchao quantized model. To automatically compute the appropriate "
            "device map, you should upgrade your `accelerate` library with `pip install --upgrade accelerate`"
        )

    def adjust_max_memory(self, max_memory: Dict[str, Union[int, str]]) -> Dict[str, Union[int, str]]:
        # Reserve 10% of each device's memory as headroom.
        return {key: val * 0.9 for key, val in max_memory.items()}

    def _process_model_before_weight_loading(
        self, model: "PreTrainedModel", keep_in_fp32_modules: Optional[List[str]] = None, **kwargs
    ):
        self.modules_to_not_convert = self.get_modules_to_not_convert(
            model, self.quantization_config.modules_to_not_convert, keep_in_fp32_modules
        )
        if self.quantization_config.include_input_output_embeddings:
            # Embeddings are quantized as well, so drop them from the skip list.
            input_emb = model.get_input_embeddings()
            input_emb_names = [name for name, module in model.named_modules() if id(module) == id(input_emb)]
            output_emb = model.get_output_embeddings()
            output_emb_names = [name for name, module in model.named_modules() if id(module) == id(output_emb)]
            self.modules_to_not_convert = [
                x for x in self.modules_to_not_convert if x not in input_emb_names + output_emb_names
            ]

    def check_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ) -> bool:
        if self.quantization_config.quant_type == "autoquant":
            return False

        param_device = kwargs.pop("param_device", None)
        # Skip modules that are explicitly excluded from quantization.
        if any((key + "." in param_name) or (key == param_name) for key in self.modules_to_not_convert):
            return False
        if param_device == "cpu" and self.offload:
            # We don't quantize weights that we offload.
            return False
        # Only the `weight` of nn.Linear (and, optionally, nn.Embedding) modules is quantized.
        module, tensor_name = get_module_from_name(model, param_name)
        _QUANTIZABLE = [torch.nn.Linear]
        if self.quantization_config.include_input_output_embeddings:
            _QUANTIZABLE.append(torch.nn.Embedding)
        return isinstance(module, tuple(_QUANTIZABLE)) and tensor_name == "weight"

    def create_quantized_param(
        self,
        model: "PreTrainedModel",
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: List[str],
    ):
        """
        Each nn.Linear layer that needs to be quantized is processed here.
        First, we set the value of the weight tensor, then we move it to the target device. Finally, we quantize the module.
        """
        if self.quantization_config.quant_type == "autoquant":
            return

        from torchao.quantization import quantize_

        module, tensor_name = get_module_from_name(model, param_name)
        if self.pre_quantized:
            # Pre-quantized checkpoint: just load the already-quantized tensor onto the target device.
            module._parameters[tensor_name] = torch.nn.Parameter(
                param_value.to(device=target_device), requires_grad=param_value.requires_grad
            )
            if isinstance(module, nn.Linear):
                module.extra_repr = types.MethodType(_linear_extra_repr, module)
        else:
            assert isinstance(self.quantization_config, TorchAoConfig)
            module._parameters[tensor_name] = torch.nn.Parameter(
                param_value, requires_grad=param_value.requires_grad
            ).to(device=target_device)
            # When quantizing tied embedding weights, avoid tying the quantized weights: load the weight,
            # run tie_weights to populate the weights, then mark the text config as untied before quantizing.
            input_embed = model.get_input_embeddings()
            if self.quantization_config.untie_embedding_weights and id(module) == id(input_embed):
                model.tie_weights()
                setattr(model.config.get_text_config(decoder=True), "tie_word_embeddings", False)

            if self.quantization_config._get_ao_version() > version.Version("0.10.0"):
                from torchao.quantization import AOPerModuleConfig

                config = self.quantization_config.get_apply_tensor_subclass()
                if isinstance(config, AOPerModuleConfig):
                    # Per-module config: look up this module's fully qualified name, falling back to "_default".
                    module_fqn, _ = param_name.rsplit(".", 1)
                    if module_fqn in config.module_fqn_to_config:
                        c = config.module_fqn_to_config[module_fqn]
                    else:
                        c = config.module_fqn_to_config.get("_default", None)
                    if c is not None:
                        # filter_fn: do not filter out any modules
                        quantize_(module, c, filter_fn=lambda x, fqn: True)
                    return

            quantize_(module, self.quantization_config.get_apply_tensor_subclass())

    def _process_model_after_weight_loading(self, model, **kwargs):
        """No processing is required for torchao quantized models, except when using autoquant."""
        if self.quantization_config.quant_type == "autoquant":
            from torchao import autoquant
            from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST

            model = torch.compile(model, mode="max-autotune")
            model = autoquant(
                model,
                qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST,
                set_inductor_config=False,
                **self.quantization_config.quant_type_kwargs,
            )
            return model

    def is_serializable(self, safe_serialization=None) -> bool:
        if safe_serialization:
            logger.warning(
                "torchao quantized model does not support safe serialization, please set `safe_serialization` to False"
            )
            return False
        _is_torchao_serializable = version.parse(importlib.metadata.version("huggingface_hub")) >= version.parse("0.25.0")
        if not _is_torchao_serializable:
            logger.warning("torchao quantized model is only serializable after huggingface_hub >= 0.25.0")
        if self.offload and self.quantization_config.modules_to_not_convert is None:
            logger.warning(
                "The model contains offloaded modules and these modules are not quantized. We don't recommend saving "
                "the model as we won't be able to reload them. If you want to specify modules to not quantize, please "
                "specify modules_to_not_convert in the quantization_config."
            )
            return False
        return _is_torchao_serializable

    def get_cuda_warm_up_factor(self):
        """
        This factor is used in caching_allocator_warmup to determine how many bytes to pre-allocate for CUDA warmup.
        - A factor of 2 means we pre-allocate the full memory footprint of the model.
        - A factor of 4 means we pre-allocate half of that, and so on.

        However, when using TorchAO, calculating memory usage with param.numel() * param.element_size() doesn't give
        the correct size for quantized weights (like int4 or int8): TorchAO internally represents quantized tensors
        using subtensors and metadata, and the reported element_size() still corresponds to the torch_dtype, not the
        actual bit-width of the quantized data.

        To correct for this:
        - Use a division factor of 8 for int4 weights
        - Use a division factor of 4 for int8 weights
        """
        if self.quantization_config._get_ao_version() > version.Version("0.9.0"):
            from torchao.core.config import AOBaseConfig

            quant_type = self.quantization_config.quant_type
            if isinstance(quant_type, AOBaseConfig):
                size_digit = fuzzy_match_size(quant_type.__class__.__name__)
                return 8 if size_digit == "4" else 4
        map_to_warm_up_factor = {
            "int4_weight_only": 8,
            "int8_weight_only": 4,
            "int8_dynamic_activation_int8_weight": 4,
            "autoquant": 4,
        }
        return map_to_warm_up_factor[self.quantization_config.quant_type]

    @property
    def is_trainable(self) -> bool:
        supported_quant_types_for_training = ["int8_weight_only", "int8_dynamic_activation_int8_weight"]
        return self.quantization_config.quant_type in supported_quant_types_for_training

    @property
    def is_compileable(self) -> bool:
        return True
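# Usage sketch (illustrative, not part of this module): this quantizer is selected automatically by
# `from_pretrained` when a `TorchAoConfig` is passed as `quantization_config`. The checkpoint name and
# `group_size` below are placeholder values.
#
#   import torch
#   from transformers import AutoModelForCausalLM, TorchAoConfig
#
#   quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
#   model = AutoModelForCausalLM.from_pretrained(
#       "meta-llama/Llama-3.2-1B",        # placeholder model id
#       torch_dtype=torch.bfloat16,       # int4_weight_only expects bfloat16 (see update_torch_dtype above)
#       device_map="auto",
#       quantization_config=quantization_config,
#   )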