
    fThA                        S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	rSSKJ
r
  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSKJr  \R4                  " \5      r\" 5       (       a  SSKJr  SSKJ r J!r!  OSu  r r!r\" 5       (       a	  SSK"J#r#J$r$  OSu  r$r#\%" \\ \!\#\$45      r&S\RN                  S\(4S jr)S r*S r+S r, " S S5      r- " S S\R                  R\                  5      r/ " S S\
R\                  5      r0 " S S \
R\                  5      r1 " S! S"\
R\                  5      r2\ " S# S$\5      5       r3\ " S% S&\5      5       r4\ " S' S(\5      5       r5\ " S) S*\35      5       r6\" S+S,9 " S- S.\3\5      5       r7/ S/Qr8g)0zPyTorch MAMBA2 model.    N)	dataclass)OptionalTupleUnion)nn   )ACT2FN)GenerationMixin)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )Mamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_update)NNinput_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shapes      b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizer'   A   sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UU    c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r      )r'   r    r!   reshape)r   r   
chunk_sizes      r&   reshape_into_chunksr.   L   s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r(   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r*   .Ndevicedtype)diagonalr   dim)
sizeexpandr"   trilonesr2   boolmasked_fillcumsuminf)r   r-   masktensor_segsums       r&   segment_sumrB   `   s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr(   c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr   r   )r!   r3   to)hidden_statesattention_maskr3   s      r&   apply_mask_to_padding_statesrG   t   s_     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr(   c            
           \ rS rSrSr\R                  S4S\S\S\R                  S\
\   4S jjr SS	\S
\R                  S\S\R                  4S jjrS	\S\R                  4S jrS rSrg)Mamba2Cache   af  
Arguments:
    config: Mamba2Config
    batch_size: int
    dtype: torch.dtype
    device: torch.device

Attributes:
    dtype: (`torch.dtype`):
        The default `dtype` used to initializing the cache.
    conv_kernel_size: (`int`):
        Model's convolution kernel size taken from config.
    n_groups: (`int`):
        Model's number of groups taken from the config - similar to tensor parallel in Transformer.
    state_size: (`int`):
        Model's SSM state size taken from config.
    num_heads: (`int`):
        The number of heads used in the linear attention / SSM.
    head_dim: (`int`):
        The respective dimension of the heads used in the linear attention / SSM.
    intermediate_size: (`int`):
        Model's intermediate_size based on (expand * hidden_dim) from config.
    conv_states: (`torch.Tensor`):
        A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
    ssm_states: (`torch.Tensor`):
        A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
Nconfig
batch_sizer3   r2   c           
      H   X0l         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        UR                  UR                  -  5      U l
        [        R                  " UR                  UU R                  SU R                  -  U R                  -  -   U R                  UUS9U l        [        R                  " UR                  UU R
                  U R                  U R                  UUS9U l        g )Nr+   r1   )r3   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr9   hidden_sizeintermediate_sizer"   zerosnum_hidden_layersconv_states
ssm_states)selfrK   rL   r3   r2   s        r&   __init__Mamba2Cache.__init__   s     
 & 2 2 ++))!$V]]V5G5G%G!H ;;$$""Q%6%HH!!
  ++$$NNMMOO
r(   	layer_idxnew_conv_state
cache_initreturnc                 t   U(       a3  UR                  U R                  R                  5      U R                  U'   OpU R                  U   R                  SSS9U R                  U'   US S 2SS S 24   R                  U R                  R                  5      U R                  U   S S 2S S 2S4'   U R                  U   $ )Nr*   )shiftsdimsr   )rD   rY   r2   roll)r[   r^   r_   r`   s       r&   update_conv_stateMamba2Cache.update_conv_state   s     *8*;*;D<L<L<S<S*TDY'*.*:*:9*E*J*JRT[]*J*^DY'4B1a74K4N4NtO_O_OfOf4gDY'1b1	**r(   new_ssm_statec                     UR                  U R                  R                  5      U R                  U'   U R                  U   $ N)rD   rZ   r2   )r[   r^   rh   s      r&   update_ssm_stateMamba2Cache.update_ssm_state   s4    %2%5%5doo6L6L%M	"y))r(   c                 l    U R                   R                  5         U R                  R                  5         g rj   )rY   zero_rZ   r[   s    r&   resetMamba2Cache.reset   s$     r(   )	rO   rY   r3   rS   rV   rP   rR   rZ   rQ   )F)__name__
__module____qualname____firstlineno____doc__r"   float16r   rT   r3   r   strr\   Tensorr<   rf   rk   rp   __static_attributes__ r(   r&   rI   rI      s    : KP--qu
"
03
<AKK
aijman
< PU++.3ll+HL+	+*# *ell * r(   rI   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )MambaRMSNormGated   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g rj   superr\   r   	Parameterr"   r;   weightvariance_epsilonr[   rU   eps	__class__s      r&   r\   MambaRMSNormGated.__init__   s-    ll5::k#:; #r(   c                    UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  S5      R                  SSS9nU[        R                  " X@R                  -   5      -  nU R                  UR                  U5      -  $ Nr+   r*   T)keepdim)r3   rD   r"   float32r   r#   silupowmeanrsqrtr   r   )r[   rE   gateinput_dtypevariances        r&   forwardMambaRMSNormGated.forward   s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%H?T?T4T(UU{{]--k:::r(   r   r   gư>rj   rr   rs   rt   ru   r\   r   rz   __classcell__r   s   @r&   r}   r}      s    $
	; 	;r(   r}   c            
       v  ^  \ rS rSrSrS\S\4U 4S jjr   SS\R                  S\
\   S\
\R                     S	\
\R                     4S
 jjr   SS\R                  S\
\   S\
\R                     S	\
\R                     4S jjr   SS\
\   S\
\R                     S	\
\R                     4S jjrSrU =r$ )Mamba2Mixer   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
rK   r^   c           	      f  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        [        UR                  5      U l        X l        UR                  U l        UR                  U l        [         UR                     U l        UR$                  U l        UR&                  U l        UR(                  U l        UR*                  U l        UR,                  U l        UR.                  U l        UR0                  U l        UR2                  U l        U R                  SU R(                  -  U R
                  -  -   U l        [6        R8                  " U R4                  U R4                  UR                  UR                  U R4                  UR                  S-
  S9U l        U R                  U R4                  -   U R                  -   n[6        R<                  " U R                  UUR>                  S9U l         [6        RB                  " [D        RF                  " U R                  5      5      U l$        [D        RJ                  " SU R                  S-   5      n[6        RB                  " [D        RL                  " U5      5      U l'        SU RN                  l(        [S        U R                  U R$                  S9U l*        [6        RB                  " [D        RF                  " U R                  5      5      U l+        SU RV                  l(        [6        R<                  " U R                  U R                  UR>                  S9U l,        UR>                  U l        [Z        (       d  [\        R_                  S5        g g )Nr+   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   Tr   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)0r   r\   rR   rU   rQ   ssm_state_sizerN   rO   rT   r9   rV   time_step_rankr^   use_conv_bias
hidden_act
activationr	   actlayer_norm_epsilonrms_normrP   rS   r-   time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearuse_biasin_projr   r"   r;   dt_biasarangelogA_log_no_weight_decayr}   normDout_projis_fast_path_availableloggerwarning_once)r[   rK   r^   projection_sizeAr   s        r&   r\   Mamba2Mixer.__init__   s   ))!--$// & 2 2!$V]]T5E5E%E!F!&"7"78"#11 ++&++,"(";"; ++%55#11#11..T]]1BTEXEX1XXii%%**==&&*
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#%d&<&<$BYBYZ	ejj89"&		$"8"8$:J:JQWQ`Q`a%%> &r(   rE   cache_paramscache_positionrF   c                    [        X5      nU R                  U5      nUR                  u  pgnU R                  U R                  -  n	UR                  S   SU R
                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n
UGb  UGb  US   S:  Ga  UR                  S5      R                  XU R
                  U R                  U R                  /SS9u    pp[        UUR                  U R                     U R                  R                  R                  S5      U R                  R                  U R                   5      n["        R                  " UU R
                  X/SS9u  pn["        R$                  " U R&                  R)                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R+                  SU R,                  U R                  5      R/                  ["        R0                  S9nUS S 2S S 2S 4   R+                  SSU R,                  5      nU R2                  S S 2S S4   R+                  SU R,                  5      nU R4                  S S 2S S4   R+                  SU R,                  5      nUR7                  X`R                  UR                  S   U R                  -  5      nUR7                  X`R                  UR                  S   U R                  -  5      nUR7                  X`R                  U R,                  5      n[9        UR:                  U R                     UUUUUUS USS	9
nUR7                  X`R                  U R,                  -  5      nU R=                  X5      nU R?                  U5      S S 2S S4   nU$ ["        R$                  " U R&                  R)                  5       5      * nU R@                  S
[)        S5      4:X  a  0 OSU R@                  0nU RB                  (       a  Uc  [E        UU R                  R                  R                  S5      U R                  R                  U R2                  U4U R4                  U RF                  S U R                   U R<                  R                  U R<                  RH                  U R>                  R                  U R>                  R                  U R,                  U R                  SSS.UD6nU$ UR                  XU R
                  U R                  U R                  /SS9u    ppUbi  URK                  SS5      n[L        RN                  RQ                  UURR                  UR                  S   -
  S45      nURU                  U R                  USS9  U R                   S;  aH  U RW                  U R                  URK                  SS5      5      SS U24   RK                  SS5      5      nOm[Y        URK                  SS5      U R                  R                  R                  S5      U R                  R                  U R                   S9RK                  SS5      n[        X5      n["        R                  " UU R
                  X/SS9u  pn[[        UR7                  XgSU R,                  5      UUUR7                  XgU R                  S5      UR7                  XgU R                  S5      4U RF                  U R4                  S S SU R2                  SS.UD6u  nnUb  Ub  UR]                  U R                  US9  UR7                  XgS5      nU R=                  UU5      nU R?                  U5      nU$ )Nr*   r+   r   r   r6   .r3   T)zr   dt_softplusg        r?   dt_limitF)r   r-   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr^   r_   r`   )r   swish)xr   r   r   )r-   r   r   r   r   r   r   r^   rh   )/rG   r   r!   rP   r   rV   rR   squeezesplitr   r   rY   r^   r   r   r   r   r"   expr   floatr9   rS   rD   r   r   r   viewr   rZ   r   r   r   trainingr   r-   r   	transposer   r#   r$   rO   rf   r   r   r   rk   )r[   rE   r   r   rF   projected_statesrL   seq_len_groups_time_state_sized_mlpr   hidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrY   scan_output	ssm_states                             r&   cuda_kernels_forward Mamba2Mixer.cuda_kernels_forward(  s    5]S<<6 "/!4!4
Q!%1D1D!D""2&$((()$--$"5"556 nn  #(B~VWGX[\G\0@0H0H0K0Q0Qt55t}}dnnU[] 1R 1-Aq)
 !5!((8""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2''7& M *..z>>DMM;YZM IIm:M --.q$|<Cz 
s 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%h 
} 5E4J4J4#9#94==$..Y_a 5K 511-  +3D3N3NqRS3T0"$--"3"34%669U9[9[\^9__abc#K !22"&..Y] 3  ??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'??	)
  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y" (\-E 11DNNZc1d)..zBG"iiT: mmK0
r(   c                 (   UR                   u  pVnUR                  n[        X5      nU R                  U5      n	U	R                   S   SU R                  -  -
  SU R
                  -  U R                  -  -
  U R                  -
  S-  n
U	R                  XU R                  U R                  U R                  /SS9u    p{pUb  Ub  US   S:  a  UR                  U R                  USS9  UR                  U R                     R                  U R                  R                  R                   S9n["        R$                  " XR                  R                  R'                  S5      -  SS9nU R(                  (       a  XR                  R*                  -   nU R-                  U5      nOUbh  UR/                  SS5      n[0        R2                  R5                  XR6                  UR                   S   -
  S45      nUR                  U R                  US	S9  U R-                  U R                  UR/                  SS5      5      S
S U24   R/                  SS5      5      n[        X5      n["        R                  " UU R                  U R
                  U R                  -  U R
                  U R                  -  /SS9u  nnn["        R8                  " U R:                  R=                  5       5      * nUGb  UGb  US   S:  Ga  UR>                  R                   nUS S 2SS S 24   S S 2S S
4   nUR/                  SS5      RA                  X]R                   S   U RB                  5      nU RD                  S   RA                  U RD                  R                   S   U RB                  5      n["        R0                  R2                  RG                  UUR                  UR                  5      -   5      n["        RH                  " XRJ                  S   U RJ                  S   5      nUS   RA                  U R                  U RB                  U R                  5      R                  ["        RL                  S9n["        R8                  " US   U-  5      R                  US9nURO                  XPR
                  S5      S
S S S 24   nURA                  XPR
                  U R                  U R
                  -  UR                   S   5      RQ                  5       nURO                  USUR                   S   5      nUS   US
S S S 24   -  nURO                  USU RB                  5      nUUS   -  R                  US9nURS                  U R                  UR>                  U R                     U-  U-   S9  URO                  XPR
                  S5      S
S S S 24   nURA                  XPR
                  U R                  U R
                  -  UR                   S   5      RQ                  5       nURO                  USUR                   S   5      nUR>                  U R                     R                  UR                   UR                  S9nURU                  XPR                  -  U RB                  U R                  5      nURU                  XPR                  -  U R                  S5      n["        RV                  " UU5      nURU                  XPR                  U RB                  5      nU RX                  S   RA                  U RX                  R                   S   U RB                  5      nUUU-  -   R                  UR                  5      nURO                  US5      S S 2S S
4   nGO[0        R2                  RG                  XRD                  -   5      n["        RH                  " XRJ                  S   U RJ                  S   5      nURO                  XVSU RB                  5      R=                  5       nURO                  XVSU R                  5      R=                  5       nURO                  XVSU R                  5      R=                  5       nUR[                  U R                  U R
                  -  SU R                  S9nUR[                  U R                  U R
                  -  SU R                  S9nU R\                  X`R\                  -  -
  U R\                  -  nU RX                  S   [_        UU5      -  nXS   -  nUR                  UR                  5      U-  nUUUU4 Vs/ s H  n[a        UUU R\                  5      PM     snu  nnnnURc                  SSSS5      n["        Rd                  " USS9n ["        R8                  " [g        U5      5      n!US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n"U"R%                  SS9n#U#S   U!Rc                  SSSSS5      S   -  n$U$R%                  SS9n%U%S   US S 2S S 2S 4   -  R%                  SS9n&["        R8                  " U S S 2S S 2S S 2SS 24   U -
  5      n'UU'Rc                  SSSS5      S   -  n(U(S
S S S 24   US   -  R%                  SS9n)UbE  UbB  US   S:  a9  UR>                  U R                     S S 2S S
4   R                  U)R                   S9n*O["        Rh                  " U)S S 2S S24   5      n*["        Rj                  " U*U)/SS9n)["        R8                  " [g        [0        R2                  R5                  U S S 2S S 2S S 2S4   S5      5      5      n+U+R/                  SS5      n+U+S   U)S S 2S S 2S S
4   -  R%                  SS9n,U,S S 2S S24   U,S S 2S4   n-n)["        R8                  " U 5      n.US
S S S 24   U)S S 2S S 2S S
4   -  n/U.Rc                  SSSS5      n0U/R%                  S5      U0S   -  n1U&U1-   nURO                  USU R                  U RB                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURO                  XVS5      nU-b  Ub  URS                  U R                  U-S9  U Rm                  UU5      n2U Ro                  U2R                  U5      5      n3U3$ s  snf )Nr*   r+   r6   r   Fr   r2   r   T.r0   ).NNr   r   r1   )r7   output_sizer   r   r5   )r   r   )8r!   r3   rG   r   rV   rP   r   rR   r   r   rf   r^   rY   rD   r   r   r2   r"   sumr   r   r   r   r   r   r#   r$   rO   r   r   r   rZ   r9   rS   r   softplusclampr   r   r,   
contiguousrk   r   bmmr   repeat_interleaver-   r'   r.   permuter>   rB   
zeros_likecatr   r   )4r[   rE   r   r   rF   rL   r   r   r3   r   r   r   r   r   rY   r   r   r   r   cache_devicer   dAdBdBxrZ   ssm_states_reshaped
C_reshapedyr   r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess4                                                       r&   torch_forwardMamba2Mixer.torch_forward  s    "/!4!4
Q## 5]S<<6!''+a$2H2H.HH1t}}K\_c_r_rKrrsw  tB  tB  B  GH  H,<,B,Bt55t~~V\^ -C -
)1%
 #(B~VWGX[\G\**T^^Terw*x '224>>BEET[[M_M_MfMfEgK %		kk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//03P3PSoSuSuvxSy3y{|2} ..Xcpt.u $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**#(B~VWGX[\G\'2299L Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ))..*55dnnEJSP *  		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99hq!Q|&<x&GIL,..q"b!<YGGGc4l+mI.FFKKPQKRF 'N,F>Z[K\_`K`"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A--V_-`ii4(
 !%knnU.C D$$G &{s   #!rc                     [         (       a@  SU R                  R                  R                  R                  ;   a  U R                  XX45      $ U R                  XX45      $ )Ncuda)r   r   r   r2   typer   r  )r[   rE   r   r   rF   s        r&   r   Mamba2Mixer.forward  sM     "!f0C0C0J0J0O0O&O,,].ii!!-~^^r(   )r   r   r   r   r-   r   r   rO   r   rS   rU   r   rV   r^   r   rP   r   rR   r   r   r   r   r   r   r   r   r   r   )rr   rs   rt   ru   rv   r   rT   r\   r"   ry   r   rI   
LongTensorr   r  r   rz   r   r   s   @r&   r   r      s"   @| @ @J /35915`||` {+` !!1!12	`
 !.`L -126/3B%||B% {+B%   0 01	B%
 !.B%P /35915	_ {+	_ !!1!12		_
 !.	_ 	_r(   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Mamba2RMSNormi  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z=
Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
Nr   r   s      r&   r\   Mamba2RMSNorm.__init__  s/     	ll5::k#:; #r(   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ r   )	r3   rD   r"   r   r   r   r   r   r   )r[   rE   r   r   s       r&   r   Mamba2RMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r(   r   r   r   r   s   @r&   r!  r!    s    $; ;r(   r!  c                      ^  \ rS rSrU 4S jr   SS\\   S\\R                     S\\R                     4S jjr
SrU =r$ )	Mamba2Blocki  c                    > [         TU ]  5         Xl        X l        UR                  U l        [        UR                  UR                  S9U l        [        XS9U l
        g )Nr   r^   )r   r\   rK   r^   residual_in_fp32r!  rU   r   r   r   mixer)r[   rK   r^   r   s      r&   r\   Mamba2Block.__init__  sL    " & 7 7!&"4"4&:S:ST	 =
r(   r   r   rF   c                 
   UnU R                  UR                  U R                   R                  R                  S95      nU R                  (       a  UR                  [
        R                  5      nU R                  XX4S9nXQ-   nU$ )Nr   r   r   rF   )r   rD   r   r3   r*  r"   r   r+  )r[   rE   r   r   rF   residuals         r&   r   Mamba2Block.forward  sx     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^ # 
 !0r(   )rK   r^   r+  r   r*  r   )rr   rs   rt   ru   r\   r   rI   r"   r  ry   r   rz   r   r   s   @r&   r'  r'    sT    > /35915 {+ !!1!12	
 !. r(   r'  c                   0    \ rS rSr\rSrS/rSrSr	S r
Srg)Mamba2PreTrainedModeli  backboner'  Tc                 |   [        U[        5      (       Ga{  SUR                  l        SUR                  l        [
        R                  " [
        R                  " U R                  R                  5      [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      -
  -  [        R                  " U R                  R                  5      -   5      R                  U R                  R                  S9nU[
        R                  " [
        R                   " U* 5      * 5      -   n[
        R"                  " 5          UR$                  R'                  U5        SSS5        SUR$                  l        [        U[*        R,                  5      (       aS  UR.                  bE  [1        UR.                  SS5      (       d)  [*        R2                  R5                  UR.                  5        O[[        U[*        R6                  5      (       a<  [*        R2                  R9                  UR:                  U R                  R<                  S9  U R                  R>                  (       a  URA                  5        H  u  pEUS;   d  M  [*        R2                  RC                  U[        RD                  " S5      S	9  [
        R"                  " 5          U[        RD                  " U R                  RF                  5      -  nSSS5        M     gg! , (       d  f       GN= f! , (       d  f       M  = f)
zInitialize the weights.T)minN
_no_reinitF)std)zout_proj.weight   )a)$
isinstancer   r   r   r   r"   r   randrK   rR   mathr   r   r   r   time_step_floorexpm1no_gradr   copy_r6  r   r   r   getattrinitzeros_	Embeddingnormal_r   initializer_rangerescale_prenorm_residualnamed_parameterskaiming_uniform_sqrtrX   )r[   moduler   inv_dtnameps         r&   _init_weights#Mamba2PreTrainedModel._init_weights  s
   fk**,0FLL)(,FHH%

4;;00188DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566F$$V, !(,FNN%fbii(({{&v{{L%@@GGNN6;;/--GGOOFMMt{{/L/LOM;;// "224..
 GG,,Q$))A,,?TYYt{{'D'DEE ) 5 0 !2 )s   L.L,
L),
L;	r{   N)rr   rs   rt   ru   r   config_classbase_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulrO  rz   r{   r(   r&   r2  r2    s&    L"&&*#L(Fr(   r2  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\\R                        \	S'   Srg)Mamba2Outputi  a  
Class for the MAMBA2 model outputs.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nlast_hidden_stater   rE   r{   )rr   rs   rt   ru   rv   rX  r   r"   FloatTensor__annotations__r   rI   rE   r   rz   r{   r(   r&   rW  rW    sH    $ 6:x 1 129*.L(;'.8<M8E%"3"345<r(   rW  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Srg)	Mamba2CausalLMOutputi  a  
Base class for causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nlosslogitsr   rE   r{   )rr   rs   rt   ru   rv   r]  r   r"   rY  rZ  r^  r   rI   rE   r   rz   r{   r(   r&   r\  r\    s\    ( )-D(5$$
%,*.FHU&&'.*.L(;'.8<M8E%"3"345<r(   r\  c                     ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\   S\	\
R                     S\	\
R                     S\\\4   4S jj5       rSrU =r$ )Mamba2Modeli3  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )Nr)  Fr   )r   r\   r   rD  
vocab_sizerU   
embeddings
ModuleListrangerX   r'  layersgradient_checkpointingr!  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r[   rK   idxr   s      r&   r\   Mamba2Model.__init__5  s     ,,v'8'8&:L:LMmmSXY_YqYqSr$sSrC[%GSr$st&+##F$6$6F<U<UV//? %ts   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)r[   
state_dictprefixargsks        r&   rj  Mamba2Model.load_hookA  s4    Aq EO^^TUEV
99\=AB r(   c                     U R                   $ rj   rc  ro   s    r&   get_input_embeddings Mamba2Model.get_input_embeddingsG  s    r(   c                     Xl         g rj   rw  r[   new_embeddingss     r&   set_input_embeddings Mamba2Model.set_input_embeddingsJ  s    (r(   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   rF   ra   c	                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Ucn  [        U R                   UR                  S5      UR                  UR                  S9n[        R                  " SU R                   R                  UR                  S9nOUc  [        S5      eOSnUn
U(       a  SOSnU R                   HZ  nU R                  (       a/  U R                  (       a  U R!                  UR"                  XXx5      n
O	U" U
UUUS	9n
U(       d  MU  X4-   nM\     U R%                  U
5      n
U(       a  X4-   nU(       d  ['        S
 XU4 5       5      $ [)        U
U(       a  UUS9$ SUS9$ )a}  
cache_params (`Mamba2Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
    If `cache_params` is passed, `cache_position` should also be passed.
NFz:You must specify exactly one of input_ids or inputs_embedsr   r1   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr{   r.  c              3   .   #    U  H  oc  M  Uv   M     g 7frj   r{   ).0vs     r&   	<genexpr>&Mamba2Model.forward.<locals>.<genexpr>  s     f$Tq$Ts   	)rX  r   rE   )rK   r  r   r  use_return_dict
ValueErrorrc  rg  rI   r8   r2   r3   r"   r   rN   rf  _gradient_checkpointing_func__call__rh  tuplerW  )r[   r  r  r   r  r  r  r   rF   kwargsrE   all_hidden_statesmixer_blocks                r&   r   Mamba2Model.forwardM  s   0 %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#*KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !; 	 (  L%"6BD;;K**t}} $ A A((-~! !,!!-#1#1	! $#$58H$H! '  M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
r(   )rc  rg  rf  rh  )NNNNNNNN)rr   rs   rt   ru   r\   rj  rx  r}  r   r   r"   r  rI   r<   ry   r   r   rW  r   rz   r   r   s   @r&   r`  r`  3  s    
)  1548.2$(/3&*5915U
E,,-U
   0 01U
 {+	U

 D>U
 'tnU
 d^U
 !!1!12U
 !.U
 
ul"	#U
 U
r(   r`  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    )custom_introc                     ^  \ rS rSr/ rU 4S jrS rS rS rS r	     SS\
\   S\
\R                     S	\
\R                     4S
 jjr\         SS\
\R                     S\
\R"                     S\
\   S\
\R                     S\
\   S\
\   S\
\   S\
\R                     S	\
\R                     S\\\4   4S jj5       rSrU =r$ )Mamba2ForCausalLMi  c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r   r\   r`  r3  r   r   rU   rb  lm_headrk  )r[   rK   r   s     r&   r\   Mamba2ForCausalLM.__init__  sF     #F+yy!3!3V5F5FUSr(   c                     U R                   $ rj   r  ro   s    r&   get_output_embeddings'Mamba2ForCausalLM.get_output_embeddings  s    ||r(   c                     Xl         g rj   r  r{  s     r&   set_output_embeddings'Mamba2ForCausalLM.set_output_embeddings  s    %r(   c                 6    U R                   R                  5       $ rj   )r3  rx  ro   s    r&   rx  &Mamba2ForCausalLM.get_input_embeddings  s    }}1133r(   c                 8    U R                   R                  U5      $ rj   )r3  r}  r{  s     r&   r}  &Mamba2ForCausalLM.set_input_embeddings  s    }}11.AAr(   r   r   rF   c                    U(       a]  Uc  [        S5      eUS   S:  a  US S 2S4   S   nUb  S nO4[        R                  " SU R                  R                  UR
                  S9nUb  Uc  SU0nOSU0nUR                  UUUUS.5        U$ )	Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r*   r0   r   r  r  )rF   r   r  r   )r  r"   r   rK   rN   r2   update)	r[   r  r  r  r   r   rF   r  model_inputss	            r&   prepare_inputs_for_generation/Mamba2ForCausalLM.prepare_inputs_for_generation  s     % e 
 a 1$%ae,Y7	!-%)N "'a1H1HQZQaQa!b$)=+];L'3L"0 ,&"0		
 r(   r  r  labelsr  r  r  ra   c
                    Ub  UOU R                   R                  nU R                  UUUUUUUU	S9nUS   nU R                  UR	                  U R                  R
                  R                  5      5      R                  5       nSnUb)  U R                  " SXU R                   R                  S.U
D6nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
cache_params (`Mamba2Cache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
    If `cache_params` is passed, `cache_position` should also be passed.
N)r   r  r  r  r  r   rF   r   )r^  r  rb  r   )r]  r^  r   rE   r{   )rK   r  r3  r  rD   r   r3   r   loss_functionrb  r\  r   rE   )r[   r  r  r   r  r  r  r  r   rF   r  mamba2_outputsrE   r^  r]  outputs                   r&   r   Mamba2ForCausalLM.forward  s   8 &1%<k$++B]B]%'!5#)) ' 	
 'q)m..t||/B/B/H/HIJPPR%%pVt{{OeOepiopDY!33F)-)9TGf$EvE#'44(66	
 	
r(   )r3  r  )NNNNN)	NNNNNNNNN)rr   rs   rt   ru   _tied_weights_keysr\   r  r  rx  r}  r   rI   r"   r  ry   r  r   rY  r<   r   r   r\  r   rz   r   r   s   @r&   r  r    sU    &4B .25915-
 {+- !!1!12- !.-^  1559.2-1/3&*$(15158
E,,-8
   1 128
 {+	8

 ))*8
 'tn8
 d^8
 D>8
 !.8
 !.8
 
u**	+8
 8
r(   r  )r  r`  r2  )9rv   r<  dataclassesr   typingr   r   r   r"   torch.utils.checkpointr   activationsr	   
generationr
   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   configuration_mamba2r   
get_loggerrr   r   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr   r   causal_conv1dr   r   allr   ry   rT   r'   r.   rB   rG   rI   Moduler}   r   r!  r'  r2  rW  r\  r`  r  __all__r{   r(   r&   <module>r     s     ! ) )    ! ) - 
 W . 
		H	% RmmZjW?AWDD-7**!( VU\\ VS V
((J  J Z; ;${_")) {_|;BII ;"")) 8 /FO /F /Fd =; = =0 =; = =6 o
' o
 o
d ~
- ~
~
B Hr(   