
    eTh;                        S SK JrJrJr  SSKJrJrJrJr  \" 5       (       a  S SK	r	S SK
Jr  S SKrS SKJr  S SK
Jr  \" 5       (       a  S SKJr  \R(                  " \5      r\R.                  S\R0                  4S j5       rS!S	\	R4                  S
\S\\	R4                  \	R4                  4   4S jjr\R.                  S\R0                  S\R0                  S\R0                  S\R0                  4S j5       r\	R<                  4S\	R4                  S\	R4                  S\	R4                  S\	R4                  S
\\   S\	R>                  S\	R4                  4S jjr \	RB                  S\	R<                  4S\	R4                  S\	R4                  S\	R4                  S\	R4                  S
\\\\4      S\	R>                  S\	R4                  4S jj5       r" " S S\RF                  5      r$     S"S jr%  S#S  jr&g)$    )ListOptionalTuple   )is_accelerate_availableis_torch_accelerator_availableis_torch_availableloggingN)
functional)init_empty_weights
BLOCK_SIZEc                    [         R                  " SS9nXC-  [         R                  " SU5      -   n[         R                  " X-   5      R	                  [         R
                  5      n[         R                  " [         R                  " U5      5      S-  nXg-  nUR	                  UR                  R                  5      n[         R                  " X-   U5        [         R                  " X$-   U5        g )Nr   axisg      |@)tl
program_idarangeloadtofloat32maxabsdtype
element_tystore)	x_ptry_ptrs_ptrr   pidoffsxsys	            a/var/www/auris/envauris/lib/python3.13/site-packages/transformers/integrations/finegrained_fp8.pyact_quant_kernelr%   $   s    
--Q
Cbii:66D
  ,A
rvvayE!A	A	U[[##$AHHU\1HHU[!    r!   
block_sizereturnc                 h  ^  T R                  5       (       d   eT R                  S   U-  S:X  d   e[        R                  " T [        R                  S9nT R
                  " / T R                  5       S S QT R                  S5      U-  P7S[        R                  06nU 4S jn[        U   " T X#US9  X#4$ )Nr   r   r   c                 V   > [         R                  " TR                  5       U S   5      4$ )Nr   )tritoncdivnumel)metar!   s    r$   gridact_quant.<locals>.grid6   s"    AGGItL'9:<<r&   )r   )	is_contiguousshapetorch
empty_likefloat8_e4m3fn	new_emptysizer   r%   )r!   r'   r#   r"   r1   s   `    r$   	act_quantr:   0   s    ??772;#q(((%"5"56A	RQVVXcr]RAFF2J*$<REMMRA= T1az:4Kr&   BLOCK_SIZE_MBLOCK_SIZE_NBLOCK_SIZE_KGROUP_SIZE_Mc                    [         R                  " SS9n[         R                  " UU5      n[         R                  " UU5      nUU-  nUU-  nUU-  n[        UU-
  U5      nUUU-  -   nUU-  U-  n UU-  [         R                  " SU5      -   U-  n!U U-  [         R                  " SU5      -   U-  n"[         R                  " SU5      n#U U!SS2S4   U
-  U#SSS24   U-  -   -   n$UU#SS2S4   U-  U"SSS24   U-  -   -   n%UU!U-  -   n&U"U-  n'UU'U-  -   n([         R
                  " UU4[         R                  S9n)[        S[         R                  " UU5      5       H  n*[         R                  " U$U#SSS24   UU*U-  -
  :  SS9n+[         R                  " U%U#SS2S4   UU*U-  -
  :  SS9n,U*U-  n-U-U	-  n.[         R                  " U&U.U-  -   5      n/[         R                  " U(U.U-  -   5      n0U)[         R                  " U+U,5      U/SS2S4   -  U0SSS24   -  -  n)U$UU-  -  n$U%UU-  -  n%M     UR                  R                  [         R                  :X  a   U)R                  [         R                  5      n1OgUR                  R                  [         R                  :X  a   U)R                  [         R                  5      n1OU)R                  [         R                  5      n1UU-  [         R                  " SU5      -   n2U U-  [         R                  " SU5      -   n3X.U2SS2S4   -  -   UU3SSS24   -  -   n4U2SS2S4   U:  U3SSS24   U:  -  n5[         R                  " U4U1U5S9  g)zTriton-accelerated function used to perform linear operations (dot
product) on input tensors `A` and `B` with block-wise quantization, and
store the result in output tensor `C`.
r   r   Nr+   g        )maskother)r@   )r   r   r.   minr   zerosr   ranger   dotr   r   bfloat16r   float16r   )6ABCAsBsMNKgroup_ngroup_k	stride_am	stride_ak	stride_bk	stride_bn	stride_cm	stride_cnstride_As_mstride_As_kstride_Bs_kstride_Bs_nr;   r<   r=   r>   r   	num_pid_m	num_pid_nnum_pid_in_groupgroup_idfirst_pid_mgroup_size_mpid_mpid_noffs_amoffs_bnoffs_ka_ptrsb_ptrsAs_ptrsoffs_bsnBs_ptrsaccumulatorkabk_startoffs_ksa_sb_scoffs_cmoffs_cnc_ptrsc_masks6                                                         r$   _w8a8_block_fp8_matmulry   >   s[   J --Q
C<(I<(I#i/&&H\)Ky;.=L3-.E##4E|#bii<&@@AEG|#bii<&@@AEGYYq,'F'!T'"Y.a91LLMF&D/I-a0@90LLMF7[((G'!H8k))G((L,7rzzJK1bgga./GGFa1q<7G3G!GsSGGF41q<7G3G!GsSl"W$ggg+ 556ggg+ 556rvva|c!T'l2Sq\AA,**,** 0 	wwR[[(NN2;;'	
		rzz	)NN2::&NN2::&l"RYYq,%??Gl"RYYq,%??GWQW---	GD!G<L0LLFag"wtQw'7!';<FHHVQV$r&   rH   rI   rK   rL   output_dtypec                   ^^ [        U5      S:X  d   eUS   US   pvU R                  S   UR                  S   :X  d   eU R                  SS UR                  SS :X  a  U R                  5       (       d   e[        R                  " U R                  S   U5      UR                  S   :X  d   eU R                  5       U R                  S   -  mUR                  S:X  a%  UR                  5       (       a  UR                  S:X  d   eUR                  u  mn[        R                  " TU5      UR                  S   :X  d   e[        R                  " X5      UR                  S   :X  d   eU R                  SS T4-   n	U R                  XS9n
SnTU:  a"  [        R                  " T5      n[        US5      nUnX|-  S:X  d   eUnUU4S	 jn[        U   " U UU
UUTTUUUU R                  S
5      U R                  S5      UR                  S5      UR                  S5      U
R                  S
5      U
R                  S5      UR                  S
5      UR                  S5      UR                  S5      UR                  S5      UUUSS9  U
$ )aQ  This function performs matrix multiplication with block-wise
quantization.
It takes two input tensors `A` and `B` with scales `As` and `Bs`.
The output is returned in the specified `output_dtype`.
Args:
    A: The input tensor, e.g., activation.
    B: The input tensor, e.g., weight.
    As: The per-token-group quantization scale for `A`.
    Bs: The per-block quantization scale for `B`.
    block_size: The block size for per-block quantization. It should
    be 2-dim, e.g., [128, 128].
    output_dytpe: The dtype of the returned tensor.
Returns:
    torch.Tensor: The result of matmul.
r   r      r*   Nr+         c                 p   > [         R                  " TU S   5      [         R                  " TU S   5      -  4$ )Nr;   r<   )r-   r.   )METArM   rN   s    r$   r1   *w8a8_block_fp8_matmul_triton.<locals>.grid   s1    AtN34v{{1d>FZ7[[]]r&      )r;   r<   r=   r>   )lenr4   r3   r-   r.   r/   ndimr8   next_power_of_2r   ry   stride)rH   rI   rK   rL   r'   rz   block_nblock_krO   C_shaperJ   r;   r=   r<   r1   rM   rN   s                  @@r$   w8a8_block_fp8_matmul_tritonr      sE   . z?a!!}jmW772;!''"+%%%773B<288CR=(Q__->->>>;;qwwr{G,<<<		QWWR[ A66Q;1??,,A==77DAq;;q'"bhhqk111;;q"bhhqk111ggcrlaT!G	G0AL<--a0<,L!Q&&&L^ 4 			

									
		"
		"
		!
		!!!!16 Hr&   input_qweight_qinput_scaleweight_scalec                 n   U R                   S:X  a  U R                  OSU R                  S   U R                  S   4u  pgnUR                  S   n	U R                  SU5      n
UR                  UR                  S   S5      nXS   -  nXS   -  n[        R                  " Xg-  U	4[        R
                  U R                  S9n[        U5       H  nXS   -  nUUS   -   n[        U5       H  nUUS   -  nUUS   -   nU
SS2UU24   nUUU2UU24   nUSS2UUS-   24   nX?U4   n[        R                  " UUR                  5       [        R                  " S[        R
                  U R                  S9UUS9U-  nUSS2UU24==   U-  ss'   M     M     UR                  XgU	5      nUR                  U5      $ )a  
Performs blocked matrix multiplication with FP8 quantized matrices.

Args:
    input_q: Quantized input tensor with 1x128 block quantization
    weight_q: Quantized weight tensor with 128x128 block quantization
    input_scale: Scaling factors for input blocks
    weight_scale: Scaling factors for weight blocks
    block_size: Tuple of (M, N) for weight block dimensions
    output_dtype: Desired output dtype
   r|   r   r*   r   deviceN)scale_ascale_b	out_dtype)r   r4   viewr5   rC   r   r   rD   
_scaled_mmttensorr   )r   r   r   r   r'   rz   
batch_sizeseq_len
hidden_dimout_featuresinput_reshapedinput_scale_reshapednum_weight_blocks_mnum_weight_blocks_noutputim_startm_endjn_startn_endinput_blockweight_blockcurr_input_scalecurr_weight_scaleblock_results                             r$   w8a8_block_fp8_matmul_compiler      s   ( 8?||q7HgmmqRYR_R_`aRbdkdqdqrsdtNu#J>>!$L \\"j1N&++K,=,=a,@"E&Q-7$15[[*.=U]][b[i[ijF&'m#*Q-'*+A*Q-'Gjm+E )GEM)9:K#GEM75=$@AL  4Aq1q5yLA ,T 2    NN$!LL%--W-* ##  1gem#$4$/ ,	 (: [[l;F99\""r&   c                      ^  \ rS rSr\R
                  r     SS\S\S\S\	\
\\4      4U 4S jjjrS\R                  S\R                  4S	 jrS
rU =r$ )	FP8Lineari&  in_featuresr   biasr'   c           	        > [         T
U ]  X5        Xl        X l        [        R
                  R                  [        R                  " X![        R                  US95      U l
        U R                  R                  5       S:X  a^  X%S   -   S-
  US   -  nXS   -   S-
  US   -  n	[
        R                  " [        R                  " X[        R                  US95      U l        OU R                  SS 5        XPl        Xpl        U(       a:  [
        R                  " [        R                  " U R                  5      5      U l        g U R                  SS 5        g )Nr   r|   r   weight_scale_invr   )super__init__r   r   r5   nn	Parameteremptyr   r   weightelement_sizer   r   register_parameterr'   activation_schemer   )selfr   r   r   r   r'   r   r   scale_out_featuresscale_in_features	__class__s             r$   r   FP8Linear.__init__)  s    	3&(hh((\V_VeVent)uv;;##%*".A">"BzRS}!T!,!}!<q!@ZPQ] R$&LL._ef%D! ##$6=$!2U[[1B1B%CDDI##FD1r&   inputr(   c           
         U R                   R                  5       S:  a+  [        R                  " XR                   U R                  5      $ [        5       (       a(  [        R                  R                  5       R                  OSn[        [        U[        R                  5      nUR                  UR                  5         [        XR                  S   5      u  pE[        UU R                   UU R                   U R                  UR"                  S9nS S S 5        UR%                  5         U R                  b  WU R                  -   nWR'                  UR"                  S9$ ! , (       d  f       NS= f)Nr|   cuda)rz   r+   )r   r   Flinearr   r   r5   acceleratorcurrent_acceleratortypegetattrr   r   r:   r'   r   r   r   synchronizer   )r   r   device_typetorch_accelerator_moduleqinputscaler   s          r$   forwardFP8Linear.forwardK  s
   ;;##%)88E;;		:: KiJjJj%++??AFFpvK'.uk5::'N$)00> )%1C D5KK))OO!& ? %002yy$$))+995;;9// ?>s   =AE
E))r   r   r'   r   r   r   r   )FNNNdynamic)__name__
__module____qualname____firstlineno__r5   r7   r   intboolr   r   r   Tensorr   __static_attributes____classcell__)r   s   @r$   r   r   &  s}    E 04# 2 2  2 	 2 U38_- 2  2D0U\\ 0ell 0 0r&   r   c                   ^	 Uc  / nU R                  5        GHL  u  pgUR                  U5        [        U[        R                  5      (       a  Xb=(       d    / ;  a  SR                  U5      m	[        U	4S jU=(       d    /  5       5      (       d  [        5          [        UR                  UR                  UR                  SLUR                  R                  UR                  R                  UR                  UR                   S9U R"                  U'   SnSSS5        [%        ['        UR)                  5       5      5      S:  a  [+        UUUUUUS9u  pUR-                  S5        GMO     X4$ ! , (       d  f       N]= f)	z%Replace Linear layers with FP8Linear.N.c              3   ,   >#    U  H	  oT;   v   M     g 7f)N ).0keycurrent_key_name_strs     r$   	<genexpr>+_replace_with_fp8_linear.<locals>.<genexpr>u  s     ]?[s22?[s   )r   r   r   r   r   r   r'   Tr   )has_been_replacedr*   )named_childrenappend
isinstancer   Linearjoinanyr   r   r   r   r   r   r   r   r   weight_block_size_modulesr   listchildren_replace_with_fp8_linearpop)
modeltp_planmodules_to_not_convertcurrent_key_namequantization_configr   namemodule_r   s
            @r$   r   r   d  sI    ,,.%fbii((T:VTV-W#&88,<#= ]?U?[Y[?[]]]')+4$*$6$6%+%8%8#[[4%}}33$mm11*=*O*O#6#H#H,ENN4( )-% * tFOO%&'!+#;& #"3$ A 	R ; /> ##3 *)s   A;E++
E9	c                     Uc  S/OUnUR                   b  UR                  UR                   5        [        [        U5      5      n[	        U U R
                  UUS9u  pU(       d  [        R                  S5        U $ )z:Helper function to replace model layers with FP8 versions.lm_head)r   r   r   zYou are loading your model using fp8 but no linear modules were found in your model. Please double check your model architecture.)r   extendr   setr   _tp_planloggerwarning)r   r   r   r   s       r$   replace_with_fp8_linearr    s     -C,Ji[Pf11=%%&9&P&PQ!#&<"=>75/	 E <	

 Lr&   )r}   )NNNNF)NN)'typingr   r   r   utilsr   r   r	   r
   r5   torch.nnr   r-   triton.languagelanguager   r   r   
accelerater   
get_loggerr   r  jit	constexprr%   r   r   r:   ry   r   r   r   compiler   r   r   r   r  r   r&   r$   <module>r     s+    ) ( h h  (- 
		H	% bll  
 
3 
u||U\\?Y9Z 
 Q%4 ,,5Q%6 ,,7Q%8 ,,9Q%: ,,;Q% Q%t !&M||M||M 	M 		M
 S	M ++M \\Mb  -1 %>#\\>#ll># ># ,,	>#
 sCx)># ++># \\># >#B;0		 ;0@ +$`  r&   