from ..activations import ACT2FN
from ..utils import is_accelerate_available, is_fbgemm_gpu_available, is_torch_available, logging


if is_torch_available():
    import torch
    from torch import nn

if is_accelerate_available():
    from accelerate import init_empty_weights

if is_fbgemm_gpu_available():
    import fbgemm_gpu.experimental.gen_ai  # noqa: F401

logger = logging.get_logger(__name__)


class FbgemmFp8Linear(torch.nn.Linear):
    def __init__(self, in_features, out_features, bias, weight_dtype=torch.float32):
        super().__init__(in_features, out_features, bias)
        self.in_features = in_features
        self.out_features = out_features
        # FP8 weight with one scale per output row (per-channel quantization)
        self.weight = torch.nn.Parameter(torch.zeros((out_features, in_features), dtype=torch.float8_e4m3fn))
        self.weight_scale = torch.nn.Parameter(torch.zeros((out_features, 1), dtype=weight_dtype))
        # Upper bound used to clamp the per-row activation scales; filled in during module replacement
        self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)

        if bias:
            self.bias = torch.nn.Parameter(torch.zeros(self.out_features, dtype=weight_dtype))
        else:
            self.bias = None

    def forward(self, x):
        # quantize_fp8_per_row squashes the leading dimensions, so save the desired output shape here
        output_shape = [*x.shape[:-1], -1]
        x_quantized, x_scale = torch.ops.fbgemm.quantize_fp8_per_row(
            x.view(-1, x.shape[-1]).contiguous(), scale_ub=self.input_scale_ub
        )
        # The GEMM runs on the device of self.weight even if x_quantized lives elsewhere;
        # only the output is moved back to the device of x afterwards
        weight_scale_float32 = self.weight_scale.to(torch.float32)
        output = torch.ops.fbgemm.f8f8bf16_rowwise(
            x_quantized, self.weight, x_scale, weight_scale_float32, use_fast_accum=True
        )
        output = output + self.bias if self.bias is not None else output
        output = output.to(x.device)
        output = output.reshape(output_shape)
        del x_quantized, x_scale
        return output
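# Illustrative sketch (not part of the upstream module): how a single FbgemmFp8Linear
# behaves once real quantized weights are loaded into it. The shapes and scale values
# below are made up, and a CUDA device with fbgemm_gpu installed is assumed.
#
#     layer = FbgemmFp8Linear(in_features=128, out_features=256, bias=False).to("cuda")
#     layer.weight = torch.nn.Parameter(
#         torch.randn(256, 128, device="cuda").to(torch.float8_e4m3fn), requires_grad=False
#     )
#     # one scale per output row, matching the per-row (per-channel) quantization
#     layer.weight_scale = torch.nn.Parameter(torch.ones(256, 1, device="cuda"), requires_grad=False)
#     layer.input_scale_ub = torch.tensor([1200.0], dtype=torch.float, device="cuda")
#     y = layer(torch.randn(4, 128, device="cuda", dtype=torch.bfloat16))  # -> (4, 256), bf16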
class FbgemmFp8Llama4TextExperts(nn.Module):
    def __init__(self, config, dtype=torch.float32):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size
        self.act_fn = ACT2FN[config.hidden_act]
        # The gate and up projections are fused along the last dimension, hence the factor of 2
        self.gate_up_proj = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.hidden_size, 2 * self.expert_dim), dtype=torch.float8_e4m3fn)
        )
        self.gate_up_proj_scale = torch.nn.Parameter(
            torch.zeros((self.num_experts, 2 * self.expert_dim, 1), dtype=torch.float32)
        )
        self.down_proj = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.expert_dim, self.hidden_size), dtype=torch.float8_e4m3fn)
        )
        self.down_proj_scale = torch.nn.Parameter(
            torch.zeros((self.num_experts, self.hidden_size, 1), dtype=torch.float32)
        )
        self.register_buffer("input_scale_ub", torch.zeros([1], dtype=torch.float), persistent=False)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
        Returns:
            torch.Tensor: (batch_size * token_num, hidden_size)
        """
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        num_tokens = None
        next_states = torch.empty_like(hidden_states)
        for i in range(self.num_experts):
            expert_hidden = hidden_states[i]
            expert_hidden_reshaped = expert_hidden.reshape(-1, self.hidden_size)
            expert_quantized, expert_scale = torch.ops.fbgemm.quantize_fp8_per_row(
                expert_hidden_reshaped, num_tokens, self.input_scale_ub
            )
            sharded_expert_dim = self.gate_up_proj.shape[-1] // 2
            gate_up_proj_scale_float32 = self.gate_up_proj_scale.to(torch.float32)
            # First half of the fused projection is the gate, second half is up
            gate = torch.ops.fbgemm.f8f8bf16_rowwise(
                expert_quantized,
                self.gate_up_proj[i].transpose(0, 1)[:sharded_expert_dim].contiguous(),
                expert_scale,
                gate_up_proj_scale_float32[i].view(-1)[:sharded_expert_dim].view(-1, 1).contiguous(),
                use_fast_accum=True,
            )
            up = torch.ops.fbgemm.f8f8bf16_rowwise(
                expert_quantized,
                self.gate_up_proj[i].transpose(0, 1)[sharded_expert_dim:].contiguous(),
                expert_scale,
                gate_up_proj_scale_float32[i].view(-1)[sharded_expert_dim:].view(-1, 1).contiguous(),
                use_fast_accum=True,
            )
            activated = up * self.act_fn(gate)
            # The activation output is re-quantized before the down projection
            activated_quantized, activated_scale = torch.ops.fbgemm.quantize_fp8_per_row(
                activated, num_tokens, self.input_scale_ub
            )
            down_proj_scale_float32 = self.down_proj_scale.to(torch.float32)
            expert_output = torch.ops.fbgemm.f8f8bf16_rowwise(
                activated_quantized,
                self.down_proj[i].transpose(0, 1).contiguous(),
                activated_scale,
                down_proj_scale_float32[i].view(-1, 1).contiguous(),
                use_fast_accum=True,
            )
            next_states[i] = expert_output
        next_states = next_states.to(hidden_states.device)
        return next_states.view(-1, self.hidden_size)
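# Illustrative sketch (not part of the upstream module): the calling convention of
# FbgemmFp8Llama4TextExperts. Tokens are expected to arrive already grouped by expert,
# because forward() views its input as (num_experts, tokens_per_expert, hidden_size)
# and runs one FP8 GEMM pipeline per expert. `config` is assumed to be a Llama4 text
# config; the token count is made up.
#
#     experts = FbgemmFp8Llama4TextExperts(config).to("cuda")
#     tokens_per_expert = 8
#     x = torch.randn(
#         experts.num_experts * tokens_per_expert,
#         experts.hidden_size,
#         device="cuda",
#         dtype=torch.bfloat16,
#     )
#     out = experts(x)  # same shape back: (num_experts * tokens_per_expert, hidden_size)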
g }|  D ]\}	}
||	 t|
tjrp|	|vrpd| t fdd|D sptdd$ |
j	}|
j
}t|||
jdu| j|	< d}| j|	 d W d   n1 s]w   Y  tj|jgtjd	| j|	 _|
jjd
kr|	|vrd| t fdd|D stdd d||dd d < t|j| j|	< W d   n1 sw   Y  tj|jgtjd	| j|	 _tt|
 dkrt|
|||||||d\}}|d q| |fS )z
    Private method that wraps the recursion for module replacement.

    Returns the converted model and a boolean that indicates if the conversion has been successful or not.
    r   N.c                 3   $    | ]}|d   v p| kV  qdS rE   Nr"   .0keyZcurrent_key_name_strr"   r#   	<genexpr>       
z2_replace_with_fbgemm_fp8_linear.<locals>.<genexpr>T)Zinclude_buffersFr   ZLlama4TextExpertsc                 3   rF   rG   r"   rH   rK   r"   r#   rL      rM   z\d+*z.down_proj_scale)has_been_replacedpre_quantizedrA   tp_planr$   )reZnamed_childrenappend
isinstancer   Linearjoinanyr	   r   r   r   r   Z_modulesZrequires_grad_r   ZtensorZactivation_scale_ubr   r   r!   r5   subr9   Ztext_configlenlistchildren_replace_with_fbgemm_fp8_linearpop)modelmodules_to_not_convertcurrent_key_namequantization_configrO   rP   rA   rQ   rR   namemoduler   r   _r"   rK   r#   r\      sh   





r\   c              	   C   s`   |du rdgn|}|j dur||j  tt|}t| ||||||d\} }|s.td | S )a  
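# Illustrative note on the matching used above (paths are made up): a module is kept
# in high precision either when its immediate attribute name appears verbatim in
# `modules_to_not_convert`, or when the `any(...)` check matches an entry as a full
# dotted prefix or the full path of the current key.
#
#     current_key_name_str = "model.layers.0.mlp.down_proj"
#     any(k + "." in current_key_name_str or k == current_key_name_str
#         for k in ["model.layers.0"])  # True  -> this subtree is skipped
#     any(k + "." in current_key_name_str or k == current_key_name_str
#         for k in ["down_proj"])       # False -> a bare leaf name only matches
#                                       #          through the `name not in ...` check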
def replace_with_fbgemm_fp8_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    pre_quantized=False,
    config=None,
    tp_plan=None,
):
    """
    A helper function to replace all `torch.nn.Linear` modules by `FbgemmFp8Linear` modules.
    This will enable running your models using the high performance fp8 kernels from the FBGEMM library.

    The function is run recursively and replaces all `torch.nn.Linear` modules except for the `lm_head`, which
    should be kept as a `torch.nn.Linear` module. The replacement is done under the `init_empty_weights` context
    manager, so no CPU/GPU memory is required to run this function. Each weight is quantized per output channel.

    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        modules_to_not_convert (`List[str]`, *optional*, defaults to `["lm_head"]`):
            Names of the modules to not convert to `FbgemmFp8Linear`. In practice we keep the `lm_head` in full
            precision for numerical stability reasons.
        current_key_name (`List[str]`, *optional*):
            An array to track the current key of the recursion. This is used to check whether the current key (or
            part of it) is in the list of modules to not convert (for instance, modules that are offloaded to `cpu`
            or `disk`).
    """
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
    modules_to_not_convert = list(set(modules_to_not_convert))
    model, has_been_replaced = _replace_with_fbgemm_fp8_linear(
        model,
        modules_to_not_convert,
        current_key_name,
        quantization_config,
        pre_quantized=pre_quantized,
        config=config,
        tp_plan=tp_plan,
    )

    if not has_been_replaced:
        logger.warning(
            "You are loading your model using FP8 quantization but no linear modules were found in your model."
            " Please double check your model architecture, or submit an issue on github if you think this is a bug."
        )

    return model
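# Illustrative sketch (assumption, not an official snippet): this helper is normally
# invoked internally by the FBGEMM-FP8 quantizer when a model is loaded with
# `FbgemmFp8Config`, but wiring it up by hand would look roughly like this. Note that
# the swapped-in modules are created on the meta device, so real quantized weights
# still have to be loaded into them afterwards.
#
#     from transformers import AutoModelForCausalLM, FbgemmFp8Config
#
#     quantization_config = FbgemmFp8Config()
#     model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
#     model = replace_with_fbgemm_fp8_linear(model, quantization_config=quantization_config)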