o
    Zh;                     @   s  d dl mZmZmZ ddlmZmZmZmZ e r3d dl	Z	d dl
mZ d dlZd dlmZ d dl
mZ e r<d dlmZ eeZejdejfdd	Zd*de	jdedee	je	jf fddZejdejdejdejdejfddZe	jfde	jde	jde	jde	jdee de	jde	jfddZ e	j!de	jfde	jde	jde	jd e	jdeeeef  de	jde	jfd!d"Z"G d#d$ d$ej#Z$					%d+d&d'Z%		d,d(d)Z&dS )-    )ListOptionalTuple   )is_accelerate_availableis_torch_accelerator_availableis_torch_availableloggingN)
functional)init_empty_weights
BLOCK_SIZEc           	      C   s   t jdd}|| t d| }t | | t j}t t |d }|| }||jj	}t 
|| | t 
|| | d S )Nr   Zaxisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	Zx_ptrZy_ptrZs_ptrr   pidZoffsxsy r   X/var/www/auris/lib/python3.10/site-packages/transformers/integrations/finegrained_fp8.pyact_quant_kernel$   s   r      r   
block_sizereturnc                    s      sJ  jd | dksJ tj tjd} jg   d d  d| R dtji} fdd}t|  |||d ||fS )Nr   r   r   c                    s   t   | d fS )Nr   )tritoncdivnumel)metar   r   r   grid6   s   zact_quant.<locals>.grid)r   )	is_contiguousshapetorchZ
empty_likefloat8_e4m3fn	new_emptysizer   r   )r   r!   r   r   r*   r   r)   r   	act_quant0   s   2r1   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc           6      C   s  t jdd}t ||}t ||}|| }|| }|| }t|| |}|||  }|| | } || t d| | }!| | t d| | }"t d|}#| |!dddf |
 |#dddf |   }$||#dddf | |"dddf |   }%||!|  }&|"| }'||'|  }(t j||ft jd})tdt ||D ]h}*t j|$|#dddf ||*|  k dd}+t j|%|#dddf ||*|  k dd},|*| }-|-|	 }.t |&|.|  }/t |(|.|  }0|)t 	|+|,|/dddf  |0dddf  7 })|$|| 7 }$|%|| 7 }%q|j
jt jkr|)t j}1n|j
jt jkr%|)t j}1n|)t j}1|| t d| }2| | t d| }3|||2dddf   ||3dddf   }4|2dddf |k |3dddf |k @ }5t j|4|1|5d dS )zTriton-accelerated function used to perform linear operations (dot
    product) on input tensors `A` and `B` with block-wise quantization, and
    store the result in output tensor `C`.
    r   r   Nr$   g        )maskother)r6   )r   r   r&   minr   zerosr   ranger   dotr   r   Zbfloat16r   Zfloat16r   )6ABCAsBsMNKZgroup_nZgroup_kZ	stride_amZ	stride_akZ	stride_bkZ	stride_bnZ	stride_cmZ	stride_cnZstride_As_mZstride_As_kZstride_Bs_kZstride_Bs_nr2   r3   r4   r5   r   Z	num_pid_mZ	num_pid_nZnum_pid_in_groupZgroup_idZfirst_pid_mZgroup_size_mZpid_mZpid_nZoffs_amZoffs_bnZoffs_kZa_ptrsZb_ptrsZAs_ptrsZoffs_bsnZBs_ptrsZaccumulatorkabZk_startZoffs_ksZa_sZb_scZoffs_cmZoffs_cnZc_ptrsZc_maskr   r   r   _w8a8_block_fp8_matmul>   sL   %,,((0,(rH   r<   r=   r?   r@   output_dtypec                    s  t |dksJ |d |d }}| jd |jd ksJ | jdd |jdd kr/|  s1J t| jd ||jd ksAJ |  | jd   |jdkrX| rX|jdksZJ |j\}t||jd kslJ t|||jd ksyJ | jdd f }	| j|	|d}
d} |k rt }t	|d}|}|| dksJ |} fd	d
}t
| | ||
|| |||| d| d|d|d|
d|
d|d|d|d|d|||dd |
S )a  This function performs matrix multiplication with block-wise
    quantization.
    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
    The output is returned in the specified `output_dtype`.
    Args:
        A: The input tensor, e.g., activation.
        B: The input tensor, e.g., weight.
        As: The per-token-group quantization scale for `A`.
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization. It should
        be 2-dim, e.g., [128, 128].
        output_dytpe: The dtype of the returned tensor.
    Returns:
        torch.Tensor: The result of matmul.
    r   r      r#   Nr$   r       c                    s"   t  | d t | d  fS )Nr2   r3   )r%   r&   )ZMETArA   rB   r   r   r*      s   "z*w8a8_block_fp8_matmul_triton.<locals>.grid   )r2   r3   r4   r5   )lenr,   r+   r%   r&   r'   ndimr/   Znext_power_of_2r   rH   Zstride)r<   r=   r?   r@   r!   rI   Zblock_nZblock_krC   ZC_shaper>   r2   r4   r3   r*   r   rL   r   w8a8_block_fp8_matmul_triton   s^   (  


rQ   input_qweight_qinput_scaleweight_scalec              
   C   s  | j dkr| jn
d| jd | jd f\}}}|jd }	| d|}
||jd d}|	|d  }||d  }tj|| |	ftj| jd}t|D ]k}||d  }||d  }t|D ]X}||d  }||d  }|
dd||f }|||||f }|dd||d f }|||f }tj||	 tj
dtj| jd||d| }|dd||f  |7  < qZqH||||	}||S )a  
    Performs blocked matrix multiplication with FP8 quantized matrices.

    Args:
        input_q: Quantized input tensor with 1x128 block quantization
        weight_q: Quantized weight tensor with 128x128 block quantization
        input_scale: Scaling factors for input blocks
        weight_scale: Scaling factors for weight blocks
        block_size: Tuple of (M, N) for weight block dimensions
        output_dtype: Desired output dtype
       rJ   r   r#   r   deviceN)Zscale_aZscale_bZ	out_dtype)rP   r,   viewr-   r9   r   rX   r:   Z
_scaled_mmtZtensorr   )rR   rS   rT   rU   r!   rI   Z
batch_sizeZseq_lenZ
hidden_dimout_featuresZinput_reshapedZinput_scale_reshapedZnum_weight_blocks_mZnum_weight_blocks_noutputiZm_startZm_endjZn_startZn_endZinput_blockZweight_blockZcurr_input_scaleZcurr_weight_scaleZblock_resultr   r   r   w8a8_block_fp8_matmul_compile   s>   ,

r_   c                       sb   e Zd ZejZ					ddedededee	eef  f fdd	Z
d
ejdejfddZ  ZS )	FP8LinearFNdynamicin_featuresr[   biasr!   c           
         s   t  || || _|| _tjtj||tj	|d| _
| j
 dkrJ||d  d |d  }||d  d |d  }	ttj||	tj|d| _n| dd  || _|| _|rdtt| j| _d S | dd  d S )NrW   rJ   r   weight_scale_invrc   )super__init__rb   r[   r-   nn	Parameteremptyr`   r   weightelement_sizer   rd   Zregister_parameterr!   activation_schemerc   )
selfrb   r[   rc   r   r!   rX   rl   Zscale_out_featuresZscale_in_features	__class__r   r   rf   )  s    
zFP8Linear.__init__inputr"   c              	   C   s   | j  dkrt|| j | jS t rtj j	nd}t
t|tj}||j  t|| jd \}}t|| j || j| j|jd}W d    n1 sKw   Y  |  | jd ur^|| j }|j|jdS )NrJ   cuda)rI   r$   )rj   rk   FZlinearrc   r   r-   ZacceleratorZcurrent_acceleratortypegetattrrq   rX   r1   r!   rQ   rd   r   Zsynchronizer   )rm   rp   Zdevice_typeZtorch_accelerator_moduleZqinputscaler\   r   r   r   forwardK  s&   

zFP8Linear.forward)FNNNra   )__name__
__module____qualname__r-   r.   r   intboolr   r   rf   Tensorrv   __classcell__r   r   rn   r   r`   &  s"    "r`   Fc           	         s   |du rg }|   D ]p\}}|| t|tjr_||pg vr_d| t fdd|p-g D s_t # t|j	|j
|jdu|jj|jj|j|jd| j|< d}W d   n1 sZw   Y  tt| dkrut||||||d\}}|d	 q
| |fS )
z%Replace Linear layers with FP8Linear.N.c                 3   s    | ]}| v V  qd S )Nr   ).0keyZcurrent_key_name_strr   r   	<genexpr>u  s    z+_replace_with_fp8_linear.<locals>.<genexpr>)rb   r[   rc   rX   r   rl   r!   Tr   )has_been_replacedr#   )Znamed_childrenappend
isinstancerg   Linearjoinanyr   r`   rb   r[   rc   rj   rX   r   rl   Zweight_block_sizeZ_modulesrO   listchildren_replace_with_fp8_linearpop)	modeltp_planmodules_to_not_convertZcurrent_key_namequantization_configr   namemodule_r   r   r   r   d  s<   	

	
	r   c                 C   s\   |du rdgn|}|j dur||j  tt|}t| | j||d\} }|s,td | S )z:Helper function to replace model layers with FP8 versions.NZlm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   Z_tp_planloggerwarning)r   r   r   r   r   r   r   replace_with_fp8_linear  s   

r   )r    )NNNNF)NN)'typingr   r   r   utilsr   r   r   r	   r-   Ztorch.nnrg   r%   Ztriton.languagelanguager   r
   rr   Z
accelerater   Z
get_loggerrw   r   ZjitZ	constexprr   r|   rz   r1   rH   r   r   rQ   compiler_   r   r`   r   r   r   r   r   r   <module>   s   
&Z
QA@
0