
    fThX                     R   S SK r S SKrS SKJr  S SKJrJrJrJr  S SK	r	S SK
r	S SK	Jr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJr  SSKJ r J!r!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/  \" 5       (       a  S SK0J1r1  S SK2J3r3J4r4  OSu  r1r3r4\" 5       (       a	  S SK5J6r6J7r7  OSu  r7r6\8" \1\6\745      r9Sr:\Rv                  " \<5      r= " S S\	R                  R|                  5      r? " S S\,5      r@ " S S\(5      rA " S S \5      rB " S! S"\$5      rC " S# S$\R|                  5      rD " S% S&\R|                  5      rE " S' S(\%5      rF " S) S*\*5      rG " S+ S,\)5      rH " S- S.\5      rI " S/ S0\+\I5      rJ " S1 S2\&5      rK " S3 S4\'5      rL/ S5QrMg)6    N)cycle)CallableOptionalTupleUnion)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Zamba2RMSNormGatedI   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X0l        X l        g N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer8   eps	__class__s       a/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/zamba2/modular_zamba2.pyr2   Zamba2RMSNormGated.__init__J   s2    ll5::k#:; #$    c                 X   UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  Gt pEXPR                  -  nUR                  " / UQUPU R                  P76 nUR                  S5      R                  SSS9nU[        R                  " XR                  -   5      -  nUR                  " / UQX`R                  -  P76 nU R                  UR                  U5      -  $ )Nr   T)keepdim)dtypetor4   float32r   
functionalsilushaper8   viewpowmeanrsqrtr7   r6   )	r9   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r=   forwardZamba2RMSNormGated.forwardP   s    #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4//1+00\+\{\DOO\&**1-222t2D1EKKK`K`@`4aa+00]+]{__?\]{{]--k:::r?   )r8   r7   r6   )gư>r0   )__name__
__module____qualname____firstlineno__r2   rU   __static_attributes____classcell__r<   s   @r=   r-   r-   I   s    %; ;r?   r-   c                       \ rS rSrSrg)Zamba2RMSNorm^    NrW   rX   rY   rZ   r[   ra   r?   r=   r_   r_   ^       r?   r_   c            
           \ rS rSrSr\R                  S4S\S\S\R                  S\
\   4S jjrS	\S
\R                  S\R                  S\R                  4S jrS rSS	\
\   S\4S jjrSrg)Zamba2HybridDynamicCacheb   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
Nconfig
batch_sizerC   devicec           
      ,   X0l         UR                  U l        SU l        [        UR                  UR
                  -  5      U l        UR                  U l        UR                  U l
        UR                  U l        / U l        0 U l        0 U l        0 U l        0 U l        0 U l        [%        UR&                  5       H  n[(        R*                  " UU R                  SUR,                  -  UR                  -  -   U R                  UUS9U R                   U'   [(        R*                  " X R                  UR.                  U R                  XCS9U R"                  U'   U R                  U   S:X  d  M  U R                  R1                  U5        M     [%        UR&                  5       Vs/ s H  n[(        R2                  " / /U-  US9PM     snU l        [%        UR&                  5       Vs/ s H  n[(        R2                  " / /U-  US9PM     snU l        g s  snf s  snf )NFr   ri   rC   hybridri   )rC   layers_block_typehas_previous_stateintmamba_expandr:   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr4   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r9   rg   rh   rC   ri   i_s          r=   r2   !Zamba2HybridDynamicCache.__init__p   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0A"'++&&V-A-A)AFDXDX)XX%%#DQ "'..0D0DdFYFYbh"DOOA %%a(H4''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   #H #H	layer_idxnew_conv_statecache_positionreturnc                 N   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR	                  UR
                  5      US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r#   rA   shiftsdims)r|   clamprv   rollrD   ri   zero_)r9   r   r   r   
conv_states        r=   update_conv_state*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r?   c                 l    U R                   R                  5         U R                  R                  5         g r0   )r|   r   r}   )r9   s    r=   resetZamba2HybridDynamicCache.reset   s$     r?   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  d!  U R                  U   R                  5       S:X  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rx   lenr   numelrH   )r9   r   s     r=   get_seq_length'Zamba2HybridDynamicCache.get_seq_length   sj     3<CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r?   )r{   ry   rz   rv   r|   rC   ro   rr   r   rn   rw   rt   r}   rx   r   )r   )rW   rX   rY   rZ   __doc__r4   float16r$   rp   rC   r   strr2   Tensor
LongTensorr   r   r   r[   ra   r?   r=   re   re   b   s     KP--quu"u03u<AKKuaijmanu@
+
+.3ll
+LQL\L\
+	
+ 3 3c 3 3r?   re   c                   6   ^  \ rS rSr SS\4U 4S jjjrSrU =r$ )Zamba2RotaryEmbedding   rg   c                 z   > [         TU ]  X5        U R                  X!R                  UR                  S9u  o0l        g )N)ri   basedim)r1   r2   rope_init_fn
rope_thetaattention_head_dimattention_scaling)r9   rg   ri   inv_freqr<   s       r=   r2   Zamba2RotaryEmbedding.__init__   s>    
 	(+/+<+< 1 1v7P7P ,= ,
((r?   )r   r0   )rW   rX   rY   rZ   r$   r2   r[   r\   r]   s   @r=   r   r      s     	
	
 	
r?   r   c                   X  ^  \ rS rSrSr   SS\S\\   S\\   S\\   4U 4S jjjr   SS\	R                  S\S	\\	R                     S
\\   S\\\	R                  \	R                  4      S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )Zamba2Attention   a  
Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://arxiv.org/pdf/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
expressivity with a small memory overhead (see Fig. 2 of https://arxiv.org/pdf/2411.15242).
rg   r   num_fwd_mem_blocksblock_idc           
        > [         TU ]  X5        X0l        UR                  U l        X@l        UR                  (       Ga  [        R                  " / 5      U l	        [        R                  " / 5      U l
        [        R                  " / 5      U l        [        U R                  5       GH  nXQR                  -  U:X  Gar  [        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      nO?[        R&                  " 5       n[        R&                  " 5       n[        R&                  " 5       nU R                  R)                  U5        U R                  R)                  U5        U R                  R)                  U5        GM     [+        U R                  5       V	V
s0 s H  u  pX_M	     sn
n	U l        g s  sn
n	f )NFbias)r1   r2   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr~   num_mem_blocks
SequentialLinearattention_hidden_sizerg   adapter_rankIdentityr   	enumerate	layer_dic)r9   rg   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterindexvaluer<   s              r=   r2   Zamba2Attention.__init__   s    	+"4%66 ...)+r):D&)+r):D&)+r):D&4223,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC) 4, <ETEYEY;Z[;Z<5%,;Z[[s   K4rM   attention_maskpast_key_valueposition_embeddingskwargsr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  (       aT  U R                  U   nXR                  U   " U5      -   n	XR                  U   " U5      -   n
XR                  U   " U5      -   nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nU R
                  R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU5      u  p[         nU R
                  R"                  S:w  ad  U R
                  R"                  S:X  a-  UR%                  SS5      (       a  [&        R)                  S5        O[*        U R
                  R"                     nU" U U	U
UU4U R,                  (       d  S	OU R.                  U R0                  S
.UD6u  nnUR2                  " / UQSP76 R5                  5       nU R7                  U5      nUU4$ )NrA   r#   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )dropoutscaling)rH   head_dimq_projk_projv_projrg   r   r   r   r   r   rI   	transposeuse_mem_roper   updater"   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutr   reshape
contiguouso_proj)r9   rM   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightss                     r=   rU   Zamba2Attention.forward   sA    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*D*DEV*WXe*ffL#&@&@AR&STa&bbJ'*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';LVY'_$L%'5'<'<ZW`'a$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r?   )r   r   r   r   r   r   r   r(   )rW   rX   rY   rZ   r   r$   r   rp   r2   r4   r   re   r   r   r   rU   r[   r\   r]   s   @r=   r   r      s   $ $(,0"&'\'\ C='\ %SM	'\
 3-'\ '\Z 26=AKO7)||7) 7) !.	7)
 !!9:7) &eELL%,,,F&GH7) -.7) 
u||Xell3XeELL>Q5RR	S7) 7)r?   r   c                     ^  \ rS rSrSrSS\S\\   4U 4S jjjr  SS\	R                  S\\   S\\	R                     4S	 jjrSS\\   S\\	R                     4S
 jjr  SS\\   S\\	R                     4S jjrSrU =r$ )Zamba2MambaMixeri)  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
rg   r   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        X l        UR                  U l        SU l        [        R                  " 5       U l        UR"                  U l        UR$                  U l        UR(                  U l        U R                  R,                  U l        UR0                  U l        UR2                  U l        UR4                  U l        UR6                  U l        U R                  SU R&                  -  U R
                  -  -   U l        [        R:                  " U R8                  U R8                  SUR                  U R8                  UR                  S-
  S9U l        U R                  U R8                  -   U R.                  -   n[        R>                  " U R                  UUR@                  S9U l!        [        RD                  " [F        RH                  " U R.                  5      5      U l%        [F        RL                  " SU R.                  S-   5      n[        RD                  " [F        RN                  " U5      5      U l(        SU RP                  l)        [U        U R                  U R                  U R&                  -  SS9U l+        [        RD                  " [F        RH                  " U R.                  5      5      U l,        SU RX                  l)        [        R>                  " U R                  U R                  UR@                  S9U l-        [\        (       d  [^        Ra                  S	5        g g )
NrG   r   Tr#   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r8   r;   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)1r1   r2   rg   r:   rs   rt   ru   rv   rp   rq   rr   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   rw   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr3   r4   r5   dt_biasarangelogA_log_no_weight_decayr-   normDout_projis_fast_path_availabler   r   )r9   rg   r   projection_sizeAr<   s        r=   r2   Zamba2MambaMixer.__init__1  s   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#&""t/E/E/V\`
	 ejj89"&		$"8"8$:J:JQWQgQgh%%> &r?   rM   cache_paramsr   c                    UR                   u  pEnU R                  U R                  -  nSU R                  -  SU R                  -  U R                  -  -   U R                  -   nUGb%  UR
                  (       Ga  U R                  UR                  S5      5      n	U	R                   S   U-
  S-  n
XU R                  U R                  U R                  /n[        R                  " XSS9u    plp[        UUR                  U R                     U R                  R                  R                  S5      U R                  R                   U R"                  5      n[        R                  " UU R                  Xw/SS9u  pn[        R$                  " U R&                  R)                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R+                  SU R,                  U R                  5      R/                  [        R0                  S9nUS S 2S S 2S 4   R+                  SSU R,                  5      nU R2                  S S 2S S4   R+                  SU R,                  5      nU R4                  S S 2S S4   R+                  SU R,                  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  U R,                  5      n[9        UR:                  U R                     UUUUUUS USS9
nUR7                  X@R                  U R,                  -  5      nU R=                  X5      nU R?                  U5      S S 2S S4   nU$ UbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      nU R                  U5      n[        R$                  " U R&                  R)                  5       5      * nU RD                  c  0 OS	U RD                  0nUb  [        R@                  " US:H  5      nOSnU RF                  (       Ga   U RH                  (       a  Uc  U(       a  [K        UU R                  R                  R                  S5      U R                  R                   U R2                  U4U R4                  U RL                  S U R"                  U R<                  R                  U R<                  RN                  U R>                  R                  U R>                  R                   U R,                  U R                  S
SS.UD6u  nnU$ [        R                  " UU R                  U R                  U R                  /SS9u  pnUbv  URQ                  SS5      n[R        RT                  RW                  UU RX                  UR                   S   -
  S45      nUR                  U R                     R[                  U5        [\        b  U R"                  S;  aJ  U R_                  U R                  URQ                  SS5      5      RQ                  SS5      S S 2S U24   5      nOv[]        URQ                  SS5      U R                  R                  R                  S5      U R                  R                   U R"                  S9RQ                  SS5      S S 2S U24   n[        R                  " UU R                  Xw/SS9u  pnUbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      n[a        UR7                  XESU R,                  5      UUUR7                  XEU R                  S5      UR7                  XEU R                  S5      4U RL                  U R4                  S S SU R2                  SS.UD6u  nnUb+  Ub(  UR:                  U R                     R[                  U5        UR7                  XES5      nU R=                  UU5      nU R?                  U5      nU$ )Nr   r#   rA   r   .rC   T)zr  dt_softplusdt_limitF)r  r  seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rG   swish)xr6   r   r   )r  r  r  r!  r)  r  r  )1rH   r  rt   rr   r  ro   r  squeezer	  r4   splitr*   r|   r   r  r6   r   r   expr  floatexpandr   rD   rE   r  r  rI   r%   r}   r  r  allrC   r  r  r   r'   r  r7   r   r   rF   padrv   copy_r)   r  r&   )r9   rM   r  r   rh   seq_lenr   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrN   hidden_states_B_CdtBCr  r  r  hidden_states_reshapedoutrC   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputs                                  r=   cuda_kernels_forward%Zamba2MambaMixer.cuda_kernels_forwardr  sT    "/!4!4
Q!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(G"&,,}/D/DQ/G"H(..r2[@QFE$)$2H2H$--Y]YgYg#h 05<Okm0n-Aq) 4!((8""**1-  ! #(++!'')?X#Ma
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2''7& M *..z>>DMM;YZM IIm:M--.q$|<Cz 
u )%))Na<O2P2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J !,,T^^<BB:N#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-C\'#!
 "-eiiRS@S6T6T)//E%2Aq$J5O%O$S$STY$ZM)B!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y (\-E ++DNN;AA)L)..zBG"iiT:mmK0
r?   c                    UR                   u  pEnUR                  nUb2  UR                  (       a!  U R                  UR	                  S5      5      nOOUb;  [
        R                  " US:H  5      (       d  XS S 2S S 2S 4   -  R                  U5      nU R                  U5      nUR                   S   SU R                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUGb  UR                  U R                     R!                  5       nUR                  UR"                  5      nUR                  (       Ga2  U
R%                  S5      n
UR&                  U R                     n[
        R(                  " USSS9nUR*                  S:X  a  US S 2SS S 24   OUUS S 2S S 2S4'   UR&                  U R                     R-                  U5        [
        R.                  " UR                  UR"                  5      U R0                  R2                  S S 2SS S 24   -  SS9nU R4                  (       a  XR0                  R6                  -  nU R9                  U5      R                  U5      S S 2S S4   nGOUR;                  SS5      n[<        R>                  RA                  UU RB                  UR                   S   -
  S45      nUR&                  U R                     R-                  U5        U R9                  U R1                  U5      R;                  SS5      5      S S 2S U2S S 24   nUbG  [
        R                  " US:H  5      (       d)  UR                  nXS S 2S S 2S 4   -  R                  U5      nO[
        RD                  " X@R                  U RF                  U R                  4UR"                  US	9nU R9                  U R1                  UR;                  SS5      5      SS U24   R;                  SS5      5      n[
        R                  " XR                  U R                  U R                  -  U R                  U R                  -  /SS9u  pn[
        RH                  " U RJ                  RM                  5       5      * nUGbq  UR                  (       Ga_  UR*                  S:X  a
  US S 2S S4   OUS S 2SS S 24   S S 2S S4   nUR;                  SS5      RO                  XLR                   S   U RF                  5      nU RP                  S
   RO                  U RP                  R                   S   U RF                  5      n[
        R<                  R>                  RS                  UUR                  UR                  5      -   5      n[
        RT                  " XRV                  5      nUS   RO                  U R                  U RF                  U R                  5      R                  [
        RX                  S9n[
        RH                  " US
   U-  5      nUR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUS
   USS S S 24   -  nUR[                  USU RF                  5      nUUS
   -  nUR                  U R                     R-                  UR                  U R                     U-  U-   5        UR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUR                  U R                     R                  UR                  5      nUR_                  X@R                  -  U RF                  U R                  5      nUR_                  X@R                  -  U R                  S5      n[
        R`                  " UU5      nUR_                  X@R                  U RF                  5      nU Rb                  S
   RO                  U Rb                  R                   S   U RF                  5      nUUU-  -   R                  UR                  5      nUR[                  US5      S S 2S S4   nGO([<        R>                  RS                  XRP                  -   5      n[
        RT                  " XRV                  5      nUR[                  XESU RF                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nURe                  U R                  U R                  -  SU R                  S9nURe                  U R                  U R                  -  SU R                  S9nU Rf                  XPRf                  -  -
  U Rf                  -  nU Rb                  S
   [i        UU5      -  nXS
   -  nUR                  UR                  5      U-  nUUUU4 Vs/ s H  n[k        UUU Rf                  5      PM     snu  nnnnURm                  SSSS5      n[
        Rn                  " USS9n[
        RH                  " [q        U5      5      nUS S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n U R/                  SS9n!U!S
   URm                  SSSSS5      S
   -  n"U"R/                  SS9n#U#S
   US S 2S S 2S 4   -  R/                  S5      n$[
        RH                  " US S 2S S 2S S 2SS 24   U-
  5      n%UU%Rm                  SSSS5      S
   -  n&U&Rm                  SSSSS5      S
   URm                  SSSSS5      SS S S 24   -  R/                  SS9Rm                  SSSSS5      n'Ub3  UR                  (       a"  UR                  U R                     S S 2S S4   n(O[
        Rr                  " U'S S 2S S24   5      n([
        Rt                  " U(U'/SS9n'[
        RH                  " [q        [<        R>                  RA                  US S 2S S 2S S 2S4   S5      5      5      n)U'Rm                  SSSSS5      n*U)S   U*S S 2S S 2S S4   -  R/                  SS9n+U+Rm                  SSSSS5      n,U,S S 2S S24   U,S S 2S4   nn'[
        RH                  " U5      n-USS S S 24   U'S S 2S S 2S S4   -  n.U-Rm                  SSSS5      n/U.R/                  S5      U/S
   -  n0U$U0-   nUR[                  USU R                  U RF                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nUR[                  XES5      nUb+  Ub(  UR                  U R                     R-                  U5        U Rw                  UU
5      n1U Ry                  U1R                  U5      5      n2U2$ s  snf )Nr#   rA   r   r  r   r	   r   .rk   ).N).NNr  )r   output_size   )r#   r   )=rH   rC   ro   r  r,  r4   r1  rD   rr   r  rt   r  r-  r	  r}   r   cloneri   	unsqueezer|   r   ndimr3  sumr  r6   r   r   r  r   r   rF   r2  rv   r   r   r.  r  r/  r0  r  softplusr   r  rE   r   r   rI   bmmr  repeat_interleaver  r   r   permutecumsumr   
zeros_likecatr  r  )3r9   input_statesr  r   rh   r4  r   rC   r@  r8  rN   rM   r;  rC  r   r<  r=  r  r  dAdBdBxr}   ssm_states_reshaped
C_reshapedyr  pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offrF  contextualized_statess3                                                      r=   torch_forwardZamba2MambaMixer.torch_forward	  s[   !-!3!3
Q""#(G(G $\-A-A!-D E)%))NA<M2N2N$0!Q*3M$M#Q#QRW#XL $\ :!''+a$2H2H.HHAPTP]P]L]`d`s`sLssuy  vD  vD  D  IJ  J(8(>(>t55t~~V\^ )? )
%1M
 #$//?EEGI!]%9%9:I...~~a()55dnnE
"ZZ
2BG
ANASASWXAX}Q1W'=^k
1a8$((8>>zJ %		*--8H8O8O*PSWS^S^SeSefgijlmfmSn*ntv w%%![[%5%55M $ 7 : :5 A!T3, O - 7 7! <]]..!**]-@-@-DDaH
 ((8>>zJ $])C)M)MaPQ)R STUW_X_W_abTb c!-eiiPQ@Q6R6R)//E%2Aq$J5O%O$S$STY$ZM^^T]]D<O<OP$++5I !HHT[[1H1HA1N%OPSU]V]U]P]%^%h%hijlm%noM#kk-:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  e!YYtzz'')**#(G(G(G &(WW\AtSL!r!Q'{1dC<7PBa#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C ##DNN399''7"<sB 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CCAGGLJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!3!34B)11*r4==Y__aM		*D4G4GHNNPA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99hq!Q|&<x&GIL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF'L,K,K"."9"9$.."I!TSV,"W"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A$)A''7==iHii4(
 !%knnU.C D$$I &{s   	!zc                     [         (       a@  SU R                  R                  R                  R                  ;   a  U R                  XU5      $ U R                  XU5      $ )Ncuda)r  r  r6   ri   typerG  ru  )r9   rM   r  r   s       r=   rU   Zamba2MambaMixer.forward  sM     "!f0C0C0J0J0O0O&O,,].YY!!-~NNr?   )r  r  r  r   r  rg   r  r	  rv   r  r   r:   r  rr   r   r  r  r  r  rt   r  r  r  r   r  r0   r+   )rW   rX   rY   rZ   r   r$   r   rp   r2   r4   r   re   rG  ru  rU   r[   r\   r]   s   @r=   r   r   )  s    ?| ? ? ?H <@15	T||T 78T !.	Tn%AY8Z %qyz  {G  {G  rH %J <@15		O 78	O !.		O 	Or?   r   c                   H   ^  \ rS rSrSS\S\\   4U 4S jjjrSS jrSr	U =r
$ )		Zamba2MLPi  rg   r   c           
        > [         T	U ]  5         Xl        UR                  U l        UR                  U l        X l        X0l        [        R                  " U R                  SU R                  -  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        [        R                  " / 5      U l        [#        U R
                  5       H  nXAR$                  -  U:X  a  [        R&                  " [        R                  " U R                  R                  U R                  R(                  SS9[        R                  " U R                  R(                  SU R                  -  SS95      nO[        R*                  " 5       nU R                   R-                  U5        M     UR.                  n[1        U5       VVs0 s H  u  pxX_M	     snnU l        gs  snnf )a9  
This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
r   r   FN)r1   r2   rg   r:   rr   r   r   r   r   r  gate_up_proj	down_projr
   
hidden_actact_fnr   gate_up_proj_adapter_listr~   r   r   r   r   r   r   r   r   )
r9   rg   r   r   r   gate_up_proj_adapterr   r   r   r<   s
            r=   r2   Zamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../A(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG 0 !11;D_;UV;U<5%,;UVVs   -Hc                     U R                  U5      nU R                  U   nX0R                  U   " U5      -   n[        R                  " USSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr   rA   r  r   r#   )r~  r   r  r4   chunkr  r  )r9   hidden_stater   gate_up_stateoutputs        r=   rU   Zamba2MLP.forward  s{    )),7NN9-	%(F(Fy(QR^(__M1"={{=#34}Q7GG-r?   )
r  r   rg   r  r~  r  r:   rr   r   r   r+   r0   )rW   rX   rY   rZ   r$   r   rp   r2   rU   r[   r\   r]   s   @r=   r|  r|    s0    W| WPXY\P] W W< r?   r|  c                   F  ^  \ rS rSrSS\S\\   S\\   4U 4S jjjr    SS\R                  S\R                  S\S\\R                     S	\\
   S
\\   S\\R                     S\\   S\\R                   \\\R                   \R                   4      4   4S jjrSrU =r$ )Zamba2AttentionDecoderLayeri  rg   r   r   c                    > X l         [        UR                  5      n[        TU ]  X5        [        USXBS9U l        [        XUS9U l        g )NrA   )r   r   r   )r   r   )	r   r   r   r1   r2   r   	self_attnr|  feed_forward)r9   rg   r   r   num_gsr<   s        r=   r2   $Zamba2AttentionDecoderLayer.__init__  sF     V,,-+(2RXl%fRZ[r?   rM   original_hidden_statesr   r   r   r   r   r   c           
          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R	                  U5      nU R                  X5      nU4n
U(       a  X4-  n
U
$ )ab  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://arxiv.org/pdf/2405.16712).
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
rA   r  )rM   r   r   r   r   r   ra   )r4   concatenateinput_layernormr  pre_ff_layernormr  )r9   rM   r  r   r   r   r   r   r   self_attn_weightsoutputss              r=   rU   #Zamba2AttentionDecoderLayer.forward	  s    > ))=*QWYZ,,];+/>> ,
'))/ 3,
 ,
( --m<))-C "++Gr?   )r   r  r  r+   )NNFN)rW   rX   rY   rZ   r$   r   rp   r2   r4   r   re   boolr   r   r   r   FloatTensorrU   r[   r\   r]   s   @r=   r  r    s    \| \x} \X`adXe \ \ 26=A,1:>3||3 !&3 	3
 !.3 !!9:3 $D>3 &e&6&673 -.3 
u  (51B1BEDUDU1U+V"WW	X3 3r?   r  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Zamba2MambaDecoderLayeri?  rg   r   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        g )N)rg   r   r;   )r1   r2   r   mambar_   r:   rms_norm_epsr  )r9   rg   r   r<   s      r=   r2    Zamba2MambaDecoderLayer.__init__@  s7    +%VI
,V-?-?VEXEXYr?   )r  r  )	rW   rX   rY   rZ   r$   rp   r2   r[   r\   r]   s   @r=   r  r  ?  s    Z| Z Z Zr?   r  c                   |  ^  \ rS rSrS\S\R                  S\4U 4S jjr        SS\	R                  S\\	R                     S\\   S	\\	R                     S
\\	R                     S\\   S\\   S\\   S\\	R                     S\\	R"                  \\\	R"                  \	R"                  4      4   4S jjrSrU =r$ )Zamba2HybridLayeriF  shared_transformerlinearr  c                 6   > [         TU ]  XU5        U ?Xl        g r0   )r1   r2   shared_transfr  )r9   r  r  r  r<   s       r=   r2   Zamba2HybridLayer.__init__G  s!     	+U;"4r?   rM   r  r   r   causal_maskr   r   	use_cacher   r   c
           
          U R                  UUUUUUU	S9n
U
S   nU(       a  U
S   nU R                  U5      nU R                  UUUUUUU	S9n
U(       a  U
S   W4U
SS -   n
U
$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
)r  r   r   r   r   r   r   r#   )transformer_hidden_statesr   r   r   r  r   r   N)r  r  mamba_decoder)r9   rM   r  r   r   r  r   r   r  r   layer_outputsr  r  s                r=   rU   Zamba2HybridLayer.forwardN  s    @ //#9&)/ 3 0 
 %2!$4! -a 0$(KK0I$J!**&?))/ 3 + 
 *1-/@AMRSRTDUUMr?   )r  )NNNNNFFN)rW   rX   rY   rZ   r  r   r   r  r2   r4   r   r   rp   re   r  r   r   r  rU   r[   r\   r]   s   @r=   r  r  F  s   5"=5GIyy5Yp5 :>#'15.2=A,1$):>>||> !) 6> C=	>
 !.> ell+> !!9:> $D>> D>> &e&6&67> 
u  (51B1BEDUDU1U+V"WW	X> >r?   r  c                   F    \ rS rSr\rSrSrSS/rSr	Sr
SrSrSrSrS rSrg	)
Zamba2PreTrainedModeli  modelTr  r  past_key_valuesc                 ^   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        [        45      (       a&  UR                  R                  R                  S5        g [        U[         5      (       Ga  ["        R$                  " ["        R&                  " U R                   R(                  5      [*        R,                  " U R                   R.                  5      [*        R,                  " U R                   R0                  5      -
  -  [*        R,                  " U R                   R0                  5      -   5      R3                  U R                   R4                  S9nU["        R,                  " ["        R6                  " U* 5      * 5      -   nUR8                  R                  R;                  U5        ["        R<                  " SUR>                  S-   5      nUR@                  R                  R;                  ["        R,                  " U5      5        URB                  R                  R                  S5        g g )Nr   )rK   stdg      ?)minr#   )"rg   initializer_range
isinstancer   r   r
  r6   datanormal_r   r   	Embeddingpadding_idxr_   r-   fill_r   r4   r.  randrw   mathr  r  r  r   time_step_floorexpm1r  r3  r  r  r  r  )r9   moduler  r;  inv_dtr  s         r=   _init_weights#Zamba2PreTrainedModel._init_weights  s%   kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .0B CDDMM$$S) 011

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f-Q 0 01 45ALL##EIIaL1HHMM$ 2r?   ra   N)rW   rX   rY   rZ   r$   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_flex_attn_supports_sdpa_supports_cache_class_is_statefulr  r[   ra   r?   r=   r  r    sF    L&*#68QR"3!N L%r?   r  c                   $   \ rS rSrSrS\4S jrS r          SS\\	R                     S\\	R                     S	\\	R                     S
\\   S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\4   4S jjrSrg)Zamba2Modeli  zX
Model consisting of *config.num_hidden_layers* layers.

Args:
    config: Zamba2Config
rg   c           	         [         R                  X5        Xl        UR                  U l        UR
                  U l        [        R                  " UR
                  UR                  U R                  5      U l	        [        UR                  5       Vs/ s H  n[        XS9PM     nn/ n/ nUR                  U l        [        UR                  5       H  nUR                  U   S:X  a  UR                  [!        XS95        M0  UR                  U   S:X  d  ME  UR                  [        R"                  " U R                  R                  U R                  R                  SS95        UR                  [!        XS95        M     [%        U5      n[%        U5      n['        U5      nU R)                  X5U5      n[        R*                  " U5      U l        UR.                  U l        [1        UR                  UR2                  S9U l        UR6                  (       a6  UR8                  (       a  [:        R=                  S5        [?        U5      U l         SU l!        U RE                  5         g s  snf )	N)r   r  r   rl   Fr   r  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)#r  r2   rg   pad_token_idr  
vocab_sizer   r  r:   embed_tokensr~   r   r  rn   r   r   r  r   iterr   
get_layersr   layersr   r_   r  final_layernormr   use_long_contextr   r   r   
rotary_embgradient_checkpointing	post_init)r9   rg   kblocksmamba_layerslinear_layersr   r  s           r=   r2   Zamba2Model.__init__  s   &&t4!.. ++LL):):F<N<NPTP`P`aKPQWQfQfKghKga-fAKgh!'!9!9v//0A''*g5##$;F$PQ))!,8$$RYYt{{/F/FH_H_fk%lm##$;F$PQ 1 L)]+vEmmF+$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	7 is   Ic           
         / n/ U l         SU l        [        U R                  5       GH!  u  pVUS:X  Ga  U R                  S:X  a  XPl        [	        U5      nU R
                  R                  [        U R
                  R                  5      -  S:  Gam  SU S3n[        R                  " US-   S-   S-   S	-   S
-   5      n	U R                   R                  U	5        Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     U R
                  R                  (       a  Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     UR                  [        U[	        U5      [	        U5      5      5        GM  UR                  [	        U5      5        GM$     U$ )Nr   rl   r#   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysfirst_transformer_layer_idr   rn   nextrg   r   r   r   recompiler   r   r   r   r  )r9   r  r  r  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patterns                 r=   r  Zamba2Model.get_layers  s3   "$*+'$-d.D.D$E HX%22a76>3V;;--DKK4P4P0QQTUU(1(;Q%RN(*

& !PQ OO J	J
   )% ++223DE!"J'+'='=&(2zKKD^D^7^bgbpbp7p.0jj a"%j/!2"7!8/O
 !33::?K"a
 (> {{??%&
+/+A+AK*h6:HbHb;bfkftft;t79zz%q&)*o%6 '<%<8" 4 !% 7 7 > >?S T&!OJ ,B /tM7JDQ]L^_`d<01S %FT r?   N	input_idsr   position_idsr  inputs_embedsr  r   output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nUn[        R                  " U5      nU(       aM  UcJ  Ub  UR                  S   OUR                  S   n[        U R                   XR                  U R                   S9nU
cM  Ub  UR#                  U R$                  S9OSn[        R&                  " XUR                  S   -   UR                   S9n
Uc  U
R)                  S5      nU R+                  X%U
5      nU R                   R,                  (       a  U R/                  X5      nOS nU(       a  S	OS nU(       a  S	OS n[1        U R2                  5       H  u  nnU(       a  UU4-  nU R                  (       a6  U R                  (       a%  U R5                  UR6                  UUUUUUUUU5
      nOU" UUUUUUUUUS
9	nUS   nU(       d  Mv  US   c  M~  UUS   4-  nM     U R9                  U5      nU(       a  UU4-  nU(       a  UR:                  (       d  SUl        [=        UU(       a  UOS UUS9nU	(       a  U$ UR?                  5       $ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )rC   ri   r  r#   rm   ra   )r  r   r   r  r   r   r  r   T)last_hidden_stater  rM   
attentions) rg   r   r  r  use_return_dict
ValueErrorr  r   r   r   r  r4   rL  rH   re   rC   ri   r   r  r  rM  _update_causal_maskr   r  r   r  _gradient_checkpointing_func__call__r  ro   r   to_tuple)r9   r  r   r  r  r  r  r   r  r  r   rM   r  rh   past_seen_tokensr  r   all_hidden_statesall_self_attnsr   layerr  r  s                          r=   rU   Zamba2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0/8/D+-J]J]^_J`J6t{{JV`V`imitituO! #.  ..9X9X.Y 
 #\\ ]5H5H5K"KTaThThN )33A6L..~n] ;;##"&//-"N"&"6BD0d )$++ 6Iu#!m%55!**t}} $ A ANN!*"#%'! !&!+A'#1 +#2&7'(;
! *!,M   #/"}Q'7&99NE !7H ,,];  -!11?#E#E15O.(+/8Od+%	
 %v;&//*;;r?   )r   r  rg   r  r  r  r  r  rn   r  r  r  )
NNNNNNNNNN)rW   rX   rY   rZ   r   r$   r2   r  r   r4   r   r   re   r  r  r   r   r   rU   r[   ra   r?   r=   r  r    s   "| "H.d 151537>B59$(,0/3&*59v<E,,-v< !.v< u//0	v<
 "":;v<   1 12v< D>v< $D>v< 'tnv< d^v< !!1!12v< 
u--	.v< v<r?   r  c                       \ rS rSrSrg)Zamba2ForCausalLMi  ra   Nrb   ra   r?   r=   r	  r	    rc   r?   r	  c                       \ rS rSrSrg)Zamba2ForSequenceClassificationi  ra   Nrb   ra   r?   r=   r  r    rc   r?   r  )r	  r  r  r  )Nr  r  	itertoolsr   typingr   r   r   r   r4   torch.utils.checkpointr   activationsr
   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r    r!   r"   configuration_zamba2r$   +mamba_ssm.ops.triton.selective_state_updater%   !mamba_ssm.ops.triton.ssd_combinedr&   r'   causal_conv1dr)   r*   r1  r  _CONFIG_FOR_DOC
get_loggerrW   r   Moduler-   r_   re   r   r   r   r|  r  r  r  r  r  r	  r  __all__ra   r?   r=   <module>r!     s     	  3 3    ! B 7 F & N Y Y   / RmmZjW57WDD-7**46FH\]^  '			H	%; ;*	L 	D36 D3N

0 

p)n p)fkOryy kO\'		 'T;"< ;|Z4 ZF( FR$%O $%NR<*3 R<j	( 		&D 	r?   