import math
import re
from itertools import cycle
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba2 import Zamba2Config


if is_mamba_ssm_available():
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
    from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined
else:
    selective_state_update, mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None


logger = logging.get_logger(__name__)


class Zamba2RMSNormGated(torch.nn.Module):
    def __init__(self, hidden_size, group_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps
        self.group_size = group_size

    def forward(self, hidden_states, gate=None):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        if gate is not None:
            hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32))
        *prefix_dims, last_dim = hidden_states.shape
        group_count = last_dim // self.group_size
        hidden_states_group = hidden_states.view(*prefix_dims, group_count, self.group_size)
        variance = hidden_states_group.pow(2).mean(-1, keepdim=True)
        hidden_states_group = hidden_states_group * torch.rsqrt(variance + self.variance_epsilon)
        hidden_states = hidden_states_group.view(*prefix_dims, group_count * self.group_size)
        return self.weight * hidden_states.to(input_dtype)


class Zamba2RMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Zamba2RMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
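
# Illustrative sketch (not part of the modeling code): Zamba2RMSNormGated normalizes
# within contiguous groups of `group_size` channels after multiplying by SiLU(gate),
# whereas Zamba2RMSNorm normalizes over the full hidden dimension. All shapes below
# are made up for the example.
def _example_rms_norms():  # hypothetical helper, never called by the model
    hidden = torch.randn(2, 5, 64)  # (batch, seq_len, hidden_size)
    gate = torch.randn(2, 5, 64)
    gated_norm = Zamba2RMSNormGated(hidden_size=64, group_size=16)  # 64 / 16 = 4 groups
    plain_norm = Zamba2RMSNorm(hidden_size=64)
    out_gated = gated_norm(hidden, gate)  # grouped RMS statistics, SiLU-gated input
    out_plain = plain_norm(hidden)  # one RMS statistic over the whole last dim
    return out_gated.shape, out_plain.shape  # both (2, 5, 64)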


class Zamba2HybridDynamicCache(DynamicCache):
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    depends on the layer type:
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
Nconfig
batch_sizerD   devicec           
      ,   X0l         UR                  U l        SU l        [        UR                  UR
                  -  5      U l        UR                  U l        UR                  U l
        UR                  U l        / U l        0 U l        0 U l        0 U l        0 U l        0 U l        [%        UR&                  5       H  n[(        R*                  " UU R                  SUR,                  -  UR                  -  -   U R                  UUS9U R                   U'   [(        R*                  " X R                  UR.                  U R                  XCS9U R"                  U'   U R                  U   S:X  d  M  U R                  R1                  U5        M     [%        UR&                  5       Vs/ s H  n[(        R2                  " / /U-  US9PM     snU l        [%        UR&                  5       Vs/ s H  n[(        R2                  " / /U-  US9PM     snU l        g s  snf s  snf )NFrA   rq   rD   hybridrq   )rD   layers_block_typehas_previous_stateintmamba_expandr9   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr3   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r8   ro   rp   rD   rq   i_s          r<   r1   !Zamba2HybridDynamicCache.__init__s   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0A"'++&&V-A-A)AFDXDX)XX%%#DQ "'..0D0DdFYFYbh"DOOA %%a(H4''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   #H #H
key_statesvalue_states	layer_idxcache_kwargsreturnc                 |   U R                   U   R                  S   S:X  a  XR                   U'   X R                  U'   Ob[        R                  " U R                   U   U/SS9U R                   U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                   U   U R                  U   4$ )NrB   r   rA   dim)r   rI   r   r3   cat)r8   r   r   r   r   s        r<   updateZamba2HybridDynamicCache.update   s     >>)$**2.!3(2NN9%*6Y'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr>   beam_idxc                    [        [        U R                  5      5       GHT  nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GMW     g)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   rq   index_selectrE   r   r   r   )r8   r   r   rq   s       r<   reorder_cache&Zamba2HybridDynamicCache.reorder_cache   s=   s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4r>   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  d!  U R                  U   R                  5       S:X  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r   r   r   numelrI   )r8   r   s     r<   get_seq_length'Zamba2HybridDynamicCache.get_seq_length   sj     3<CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r>   c                     [        S5      eNzAZamba2HybridDynamicCache does not have a legacy cache equivalent.NotImplementedErrorri   s    r<   to_legacy_cache(Zamba2HybridDynamicCache.to_legacy_cache   s    !"effr>   past_key_valuesr   c                     [        S5      er   r   )clsr   s     r<   from_legacy_cache*Zamba2HybridDynamicCache.from_legacy_cache   s    !"effr>   new_conv_statecache_positionc                 N   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR	                  UR
                  5      US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r"   rB   shiftsdims)r   clampr~   rollrE   rq   zero_)r8   r   r   r   
conv_states        r<   update_conv_state*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r>   c                 l    U R                   R                  5         U R                  R                  5         g r/   )r   r   r   ri   s    r<   resetZamba2HybridDynamicCache.reset   s$     r>   )r   r   r   r~   r   rD   rw   rz   r   rv   r   r|   r   r   r   r/   )r   )rY   rZ   r[   r\   __doc__r3   float16r#   rx   rD   r   strr1   Tensorr   r   r	   r   
LongTensorr   r   r   classmethodFloatTensorr   r   r   r]    r>   r<   rm   rm   e   sz    KP--quu"u03u<AKKuaijmanuJ 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3guU\\':E%,,<O'O!P g guUEVEV?W9X0Y ges g g
+
+.3ll
+LQL\L\
+	
+ r>   rm   c                   n   ^  \ rS rSr SS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Zamba2RotaryEmbedding   ro   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  X!R                  UR                  S9u  o0l        U R                  SUSS9  U R                   U l        g )	Nrope_scaling	rope_typetypedefault)rq   baser   inv_freqF)
persistent)r0   r1   hasattrr   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenro   r   rope_init_fn
rope_thetaattention_head_dimattention_scalingregister_bufferr   original_inv_freq)r8   ro   rq   r   r;   s       r<   r1   Zamba2RotaryEmbedding.__init__   s    
 	6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+< 1 1v7P7P ,= ,
(( 	ZeD!%r>   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rB   r"   mpscpuF)device_typeenabledrA   r   rD   )r   floatexpandrI   rE   rq   
isinstancer   r   r3   autocast	transposer   cosr   sinrD   )
r8   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r<   rV   Zamba2RotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   ro   r   r   r   r   r   r/   )rY   rZ   r[   r\   r#   r1   r3   no_gradr   rV   r]   r^   r_   s   @r<   r   r      s<     // /. ]]_<  <r>   r   rN   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
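
# Illustrative sketch (not part of the modeling code): the hybrid cache is allocated
# once per generation with a fixed batch size; attention layers grow `key_cache` /
# `value_cache` along seq_len, while mamba layers keep fixed-shape conv/ssm states.
# `config` stands for any instantiated Zamba2Config.
def _example_hybrid_cache(config: Zamba2Config):  # hypothetical helper, never called by the model
    cache = Zamba2HybridDynamicCache(config, batch_size=2, dtype=torch.float32, device="cpu")
    assert cache.get_seq_length() == 0  # nothing cached yet
    # conv state per layer: (batch, intermediate_size + 2 * ngroups * d_state, d_conv)
    first_conv = cache.conv_states[0]
    # ssm state per layer: (batch, n_mamba_heads, headdim, d_state)
    first_ssm = cache.ssm_states[0]
    return first_conv.shape, first_ssm.shape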


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
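
# Illustrative sketch (not part of the modeling code): rotary embeddings rotate pairs of
# channels by a position-dependent angle, so attention scores depend only on relative
# position. The base 10000 and all shapes below are made up for the example.
def _example_rope():  # hypothetical helper, never called by the model
    batch, heads, seq_len, head_dim = 1, 2, 4, 8
    q = torch.randn(batch, heads, seq_len, head_dim)
    k = torch.randn(batch, heads, seq_len, head_dim)
    position_ids = torch.arange(seq_len)[None, :]
    inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2) / head_dim))
    freqs = position_ids[..., None].float() * inv_freq  # (1, seq_len, head_dim // 2)
    emb = torch.cat((freqs, freqs), dim=-1)  # (1, seq_len, head_dim)
    q_rot, k_rot = apply_rotary_pos_emb(q, k, emb.cos(), emb.sin())
    return q_rot.shape, k_rot.shape  # shapes unchanged, contents rotated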
\\   S\\\	R                  \	R                  4      S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )Zamba2AttentioniA  a+  


class Zamba2Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper.

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the
    previous (mamba) layer (see fig. 2 in https://arxiv.org/pdf/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    Finally, this attention layer contributes to tied transformer blocks aimed at increasing compute without increasing
    model size. Because this layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) are
    added to the q, k, v projectors to increase expressivity with a small memory overhead (see Fig. 2 of
    https://arxiv.org/pdf/2411.15242).
    """

    def __init__(
        self,
        config: Zamba2Config,
        layer_idx: Optional[int] = None,
        num_fwd_mem_blocks: Optional[int] = None,
        block_id: Optional[int] = None,
    ):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.attention_hidden_size = config.attention_hidden_size
        self.head_dim = config.attention_head_dim
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.scaling = (self.head_dim / 2) ** -0.5
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.num_fwd_mem_blocks = num_fwd_mem_blocks
        self.layer_block_map = config.hybrid_layer_ids
        self.block_id = block_id

        if config.use_shared_attention_adapter:
            self.linear_q_adapter_list = nn.ModuleList([])
            self.linear_k_adapter_list = nn.ModuleList([])
            self.linear_v_adapter_list = nn.ModuleList([])

            for i in range(self.num_fwd_mem_blocks):
                if i % config.num_mem_blocks == block_id:
                    linear_q_adapter = nn.Sequential(
                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
                    )
                    linear_k_adapter = nn.Sequential(
                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
                    )
                    linear_v_adapter = nn.Sequential(
                        nn.Linear(self.attention_hidden_size, self.config.adapter_rank, bias=False),
                        nn.Linear(self.config.adapter_rank, self.attention_hidden_size, bias=False),
                    )
                else:
                    linear_q_adapter = nn.Identity()
                    linear_k_adapter = nn.Identity()
                    linear_v_adapter = nn.Identity()
                self.linear_q_adapter_list.append(linear_q_adapter)
                self.linear_k_adapter_list.append(linear_k_adapter)
                self.linear_v_adapter_list.append(linear_v_adapter)

        self.layer_dic = {value: index for index, value in enumerate(self.layer_block_map)}

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Zamba2HybridDynamicCache] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        if self.config.use_shared_attention_adapter:
            adapter_layer_idx = self.layer_dic[layer_idx]
            query_states = query_states + self.linear_q_adapter_list[adapter_layer_idx](hidden_states)
            key_states = key_states + self.linear_k_adapter_list[adapter_layer_idx](hidden_states)
            value_states = value_states + self.linear_v_adapter_list[adapter_layer_idx](hidden_states)

        query_states = query_states.view(hidden_shape).transpose(1, 2)
        key_states = key_states.view(hidden_shape).transpose(1, 2)
        value_states = value_states.view(hidden_shape).transpose(1, 2)

        if self.config.use_mem_rope:
            cos, sin = position_embeddings
            query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv


def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int):
    """
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
    """
    pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0)

    return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0)


def reshape_into_chunks(input_tensor, pad_size, chunk_size):
    """
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    """
    # [bsz, seq_len, ...] -> [bsz, -1, chunk_size, ...]
    input_tensor = pad_tensor_by_size(input_tensor, pad_size)

    if len(input_tensor.shape) == 3:
        # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads]
        return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2])
    else:
        # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] ->
        # [bsz, -1, chunk_size, num_heads, head_dim or state_size]
        return input_tensor.reshape(
            input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3]
        )


def segment_sum(input_tensor):
    """
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    """
    chunk_size = input_tensor.size(-1)
    # 1. expand input tensor to have an additional dimension and repeat along that dimension
    # [..., chunk_size] -> [..., chunk_size, chunk_size]
    input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size)
    # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1)
    input_tensor = input_tensor.masked_fill(~mask, 0)
    # 3. compute actual cumsum
    tensor_segsum = torch.cumsum(input_tensor, dim=-2)
    # 4. apply mask to keep only the lower triangular part of the cumulative sum (incl. diagonal this time)
    mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0)
    tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf)
    return tensor_segsum


is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))


class Zamba2MambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    """

    def __init__(self, config: Zamba2Config, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.intermediate_size = int(config.mamba_expand * self.hidden_size)
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        self.activation = "silu"
        self.act = nn.SiLU()
        self.use_mem_eff_path = config.use_mem_eff_path

        self.n_groups = config.mamba_ngroups
        self.head_dim = config.mamba_headdim
        self.num_heads = self.config.n_mamba_heads
        self.chunk_size = config.chunk_size

        self.time_step_limit = config.time_step_limit
        self.time_step_min = config.time_step_min
        self.time_step_max = config.time_step_max

        self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size
        self.conv1d = nn.Conv1d(
            in_channels=self.conv_dim,
            out_channels=self.conv_dim,
            bias=True,
            kernel_size=config.mamba_d_conv,
            groups=self.conv_dim,
            padding=config.mamba_d_conv - 1,
        )

        # projection of the input hidden states
        projection_size = self.intermediate_size + self.conv_dim + self.num_heads
        self.in_proj = nn.Linear(
            self.hidden_size,
            projection_size,
            bias=config.add_bias_linear,
        )

        # time step projection (discretization)
        self.dt_bias = nn.Parameter(torch.ones(self.num_heads))

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state.
        A = torch.arange(1, self.num_heads + 1)
        self.A_log = nn.Parameter(torch.log(A))
        self.A_log._no_weight_decay = True
        self.norm = Zamba2RMSNormGated(
            self.intermediate_size, group_size=self.intermediate_size // self.n_groups, eps=1e-5
        )
        self.D = nn.Parameter(torch.ones(self.num_heads))
        self.D._no_weight_decay = True

        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of `(selective_state_update, causal_conv1d_fn,"
                " causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow"
                " https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d"
            )

    def cuda_kernels_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[Zamba2HybridDynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        # set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape
        groups_time_state_size = self.n_groups * self.ssm_state_size
        d_to_remove = 2 * self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.num_heads

        # single-token decoding with an existing cache
        if cache_params is not None and cache_params.has_previous_state:
            in_projected_states = self.in_proj(hidden_states.squeeze(1))
            d_mlp = (in_projected_states.shape[-1] - d_to_remove) // 2
            split_projection_dim = [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads]
            _, _, gate, hidden_states_B_C, dt = torch.split(in_projected_states, split_projection_dim, dim=-1)

            hidden_states_B_C = causal_conv1d_update(
                hidden_states_B_C,
                cache_params.conv_states[self.layer_idx],
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )

            hidden_states, B, C = torch.split(
                hidden_states_B_C,
                [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                dim=-1,
            )
            A = -torch.exp(self.A_log.float())  # (num_heads,)

            A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            dt = dt[:, :, None].expand(-1, -1, self.head_dim)
            dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim)
            D = self.D[:, None, ...].expand(-1, self.head_dim)
            B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups)
            C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups)
            hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim)
            hidden_states = selective_state_update(
                cache_params.ssm_states[self.layer_idx],
                hidden_states_reshaped,
                dt,
                A,
                B,
                C,
                D,
                z=None,
                dt_bias=dt_bias,
                dt_softplus=True,
            )
            hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim)
            hidden_states = self.norm(hidden_states, gate)
            out = self.out_proj(hidden_states)[:, None, ...]
        # no cache: call the fused kernels over the whole sequence
        else:
            if attention_mask is not None and not torch.all(attention_mask == 1):
                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
                dtype = hidden_states.dtype
                hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
            projected_states = self.in_proj(hidden_states)
            A = -torch.exp(self.A_log.float())
            dt_limit_kwargs = {} if self.time_step_limit is None else {"dt_limit": self.time_step_limit}
            if attention_mask is not None:
                input_not_masked = torch.all(attention_mask == 1)
            else:
                input_not_masked = True

            if self.use_mem_eff_path and self.training and cache_params is None and input_not_masked:
                out, ssm_state = mamba_split_conv1d_scan_combined(
                    projected_states,
                    self.conv1d.weight.squeeze(1),
                    self.conv1d.bias,
                    self.dt_bias,
                    A,
                    D=self.D,
                    chunk_size=self.chunk_size,
                    seq_idx=None,
                    activation=self.activation,
                    rmsnorm_weight=self.norm.weight,
                    rmsnorm_eps=self.norm.variance_epsilon,
                    outproj_weight=self.out_proj.weight,
                    outproj_bias=self.out_proj.bias,
                    headdim=self.head_dim,
                    ngroups=self.n_groups,
                    norm_before_gate=False,
                    return_final_states=True,
                    **dt_limit_kwargs,
                )
            else:
                gate, hidden_states_B_C, time_step = torch.split(
                    projected_states,
                    [self.intermediate_size, self.conv_dim, self.num_heads],
                    dim=-1,
                )

                # 1D convolution
                if cache_params is not None:
                    hidden_states_B_C_t = hidden_states_B_C.transpose(1, 2)
                    conv_state = nn.functional.pad(
                        hidden_states_B_C_t, (self.conv_kernel_size - hidden_states_B_C_t.shape[-1], 0)
                    )
                    cache_params.conv_states[self.layer_idx].copy_(conv_state)
                if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
                    hidden_states_B_C = self.act(
                        self.conv1d(hidden_states_B_C.transpose(1, 2)).transpose(1, 2)[:, :seq_len]
                    )
                else:
                    hidden_states_B_C = causal_conv1d_fn(
                        x=hidden_states_B_C.transpose(1, 2),
                        weight=self.conv1d.weight.squeeze(1),
                        bias=self.conv1d.bias,
                        activation=self.activation,
                    ).transpose(1, 2)[:, :seq_len]
                hidden_states, B, C = torch.split(
                    hidden_states_B_C,
                    [self.intermediate_size, groups_time_state_size, groups_time_state_size],
                    dim=-1,
                )
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
                    dtype = hidden_states.dtype
                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
                scan_output, ssm_state = mamba_chunk_scan_combined(
                    hidden_states.view(batch_size, seq_len, -1, self.head_dim),
                    time_step,
                    A,
                    B.view(batch_size, seq_len, self.n_groups, -1),
                    C.view(batch_size, seq_len, self.n_groups, -1),
                    chunk_size=self.chunk_size,
                    D=self.D,
                    z=None,
                    seq_idx=None,
                    return_final_states=True,
                    dt_bias=self.dt_bias,
                    dt_softplus=True,
                    **dt_limit_kwargs,
                )
                if ssm_state is not None and cache_params is not None:
                    cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
                scan_output = scan_output.view(batch_size, seq_len, -1)
                # multiply the "gate" branch and apply the extra normalization layer
                scan_output = self.norm(scan_output, gate)
                out = self.out_proj(scan_output)
        return out

    def torch_forward(
        self,
        input_states,
        cache_params: Optional[Zamba2HybridDynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        if cache_params is not None and cache_params.has_previous_state:
            projected_states = self.in_proj(input_states.squeeze(1))
        else:
            if attention_mask is not None and not torch.all(attention_mask == 1):
                # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
                input_states = (input_states * attention_mask[:, :, None]).to(dtype)
            projected_states = self.in_proj(input_states)
        d_mlp = (
            projected_states.shape[-1]
            - 2 * self.intermediate_size
            - 2 * self.n_groups * self.ssm_state_size
            - self.num_heads
        ) // 2
        _, _, gate, hidden_states, dt = projected_states.split(
            [d_mlp, d_mlp, self.intermediate_size, self.conv_dim, self.num_heads], dim=-1
        )

        # 2. Convolution sequence transformation
        if cache_params is not None:
            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
            ssm_state = ssm_state.to(hidden_states.device)
            if cache_params.has_previous_state:
                gate = gate.unsqueeze(1)
                conv_state = cache_params.conv_states[self.layer_idx]  # [batch, conv_dim, conv_kernel_size]
                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                # handle batched generation: states are copied through
                conv_state[:, :, -1] = hidden_states[:, 0, :] if hidden_states.ndim == 3 else hidden_states
                cache_params.conv_states[self.layer_idx].copy_(conv_state)
                hidden_states = torch.sum(conv_state.to(projected_states.device) * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                hidden_states = self.act(hidden_states).to(dtype)[:, None, ...]  # [batch, 1, conv_dim] : decoding
            else:
                hidden_states = hidden_states.transpose(1, 2)
                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx].copy_(conv_state)
                hidden_states = self.act(self.conv1d(hidden_states).transpose(1, 2))[:, :seq_len, :]
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66
                    dtype = hidden_states.dtype
                    hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
        else:
            ssm_state = torch.zeros(
                (batch_size, self.num_heads, self.head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=dtype,
            )
            hidden_states = self.act(self.conv1d(hidden_states.transpose(1, 2))[..., :seq_len].transpose(1, 2))
        hidden_states, B, C = torch.split(
            hidden_states,
            [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size],
            dim=-1,
        )
        A = -torch.exp(self.A_log.float())  # [num_heads]

        # 3. SSM transformation
        if cache_params is not None and cache_params.has_previous_state:
            # single-token decoding: no need to pad parameter matrices, there is just one new token
            dt = dt[:, None, ...] if dt.ndim == 2 else dt[:, 0, :][:, None, ...]
            dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim)
            # [num_heads] -> [num_heads, head_dim]
            dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim)

            dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype))
            dt = torch.clamp(dt, self.time_step_min)
            A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32)
            # discretize A: [bsz, num_heads, head_dim, state_size]
            dA = torch.exp(dt[..., None] * A)

            # discretize B: [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
            B = B.reshape(batch_size, self.n_groups, -1)[..., None, :]
            B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous()
            B = B.reshape(batch_size, -1, B.shape[-1])
            dB = dt[..., None] * B[..., None, :]

            # discretize x into dB: [bsz, intermediate_size] -> [bsz, num_heads, head_dim]
            hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim)
            dBx = dB * hidden_states[..., None]

            # state update
            cache_params.ssm_states[self.layer_idx].copy_(cache_params.ssm_states[self.layer_idx] * dA + dBx)

            # output: [bsz, n_groups * state_size] -> [bsz, num_heads, state_size]
            C = C.reshape(batch_size, self.n_groups, -1)[..., None, :]
            C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous()
            C = C.reshape(batch_size, -1, C.shape[-1])

            ssm_states = cache_params.ssm_states[self.layer_idx].to(C.dtype)  # [b, h, d, n]
            # merge batch and head dimensions for the batched matmul
            ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size)
            C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1)
            y = torch.bmm(ssm_states_reshaped, C_reshaped)
            y = y.view(batch_size, self.num_heads, self.head_dim)

            # D skip connection: [num_heads] -> [num_heads, head_dim]
            D = self.D[..., None].expand(self.D.shape[0], self.head_dim)
            y = (y + hidden_states * D).to(y.dtype)

            # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size]
            y = y.reshape(batch_size, -1)[:, None, ...]
        else:
            # begin ssd naive implementation without einsums
            dt = nn.functional.softplus(dt + self.dt_bias)
            dt = torch.clamp(dt, self.time_step_min)
            hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float()
            B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float()
            B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
            C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads)
            pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size

            D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size)

            # discretize x and A
            hidden_states = hidden_states * dt[..., None]
            A = A.to(hidden_states.dtype) * dt

            # rearrange into blocks/chunks
            hidden_states, A, B, C = [
                reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)
            ]

            # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size]
            A = A.permute(0, 3, 1, 2)
            A_cumsum = torch.cumsum(A, dim=-1)

            # 1. compute the output for each intra-chunk (diagonal blocks); this is the analog of a causal mask
            L = torch.exp(segment_sum(A))

            # contraction of C and B to get G (attention-weights like)
            G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :]  # (b, c, l, s, h, n)
            G = G_intermediate.sum(dim=-1)  # (b, c, l, s, h)

            # compute M, equivalent to applying an attention mask to the weights
            M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None]
            M = M_intermediate.sum(dim=-1)

            # compute Y_diag (apply to values)
            Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(3)

            # 2. compute the state for each intra-chunk (right term of the low-rank factorization; B terms)
            decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
            B_decay_contraction = B * decay_states.permute(0, 2, 3, 1)[..., None]
            states = (
                B_decay_contraction.permute(0, 1, 3, 2, 4)[..., None]
                * hidden_states.permute(0, 1, 3, 2, 4)[..., None, :]
            ).sum(dim=3).permute(0, 1, 2, 4, 3)
            if cache_params is not None and cache_params.has_previous_state:
                previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...]
            else:
                previous_states = torch.zeros_like(states[:, :1])
            states = torch.cat([previous_states, states], dim=1)
            decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0))))

            states_permuted = states.permute(0, 2, 1, 3, 4)
            result = (decay_chunk[..., None, None] * states_permuted[:, :, None, ...]).sum(dim=2)
            new_states = result.permute(0, 2, 1, 3, 4)
            states, ssm_state = new_states[:, :-1], new_states[:, -1]

            # 3. compute the state -> output conversion per chunk (left term of the factorization; C terms)
            state_decay_out = torch.exp(A_cumsum)
            C_times_states = C[..., None, :] * states[:, :, None, ...]
            state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1)
            Y_off = C_times_states.sum(-1) * state_decay_out_permuted[..., None]

            # 4. add intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks)
            y = Y_diag + Y_off
            # [bsz, -1, chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim]
            y = y.reshape(batch_size, -1, self.num_heads, self.head_dim)

            y = y + D_residual
            # cut off padded chunks
            if pad_size > 0:
                y = y[:, :seq_len, :, :]
            y = y.reshape(batch_size, seq_len, -1)
            if ssm_state is not None and cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

        scan_output = self.norm(y, gate)

        # end ssd naive

        # 4. final linear projection
        contextualized_states = self.out_proj(scan_output.to(dtype))  # [batch, seq_len, hidden_size]
        return contextualized_states

    def forward(
        self,
        hidden_states,
        cache_params: Optional[Zamba2HybridDynamicCache] = None,
        attention_mask: Optional[torch.Tensor] = None,
    ):
        if is_fast_path_available and "cuda" in self.in_proj.weight.device.type:
            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask)
        return self.torch_forward(hidden_states, cache_params, attention_mask)
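
# Illustrative sketch (not part of the modeling code): the mixer transparently picks the
# fused CUDA kernels when `mamba-ssm` / `causal-conv1d` are installed and the weights
# live on GPU, and otherwise falls back to the pure-PyTorch path above. `config` stands
# for any instantiated Zamba2Config.
def _example_mixer_dispatch(config: Zamba2Config):  # hypothetical helper, never called by the model
    mixer = Zamba2MambaMixer(config, layer_idx=0)
    x = torch.randn(1, 8, config.hidden_size)
    # On CPU (or without the optional kernels) this resolves to torch_forward;
    # on CUDA with the kernels installed it resolves to cuda_kernels_forward.
    y = mixer(x, cache_params=None, attention_mask=None)
    return y.shape  # (1, 8, config.hidden_size)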


class Zamba2MLP(nn.Module):
    def __init__(self, config: Zamba2Config, num_fwd_mem_blocks=None, block_id: int = None):
        """
        This MLP layer contributes to tied transformer blocks aimed at increasing compute without increasing model
        size. Because this layer is tied, un-tied adapter modules (formally same as LoRA, but used in the base model)
        are added to the up and gate projectors to increase expressivity with a small memory overhead.
        """
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.num_fwd_mem_blocks = num_fwd_mem_blocks
        self.block_id = block_id

        self.gate_up_proj = nn.Linear(self.hidden_size, 2 * self.intermediate_size, bias=config.add_bias_linear)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.add_bias_linear)
        self.act_fn = ACT2FN[config.hidden_act]

        self.gate_up_proj_adapter_list = nn.ModuleList([])
        for i in range(self.num_fwd_mem_blocks):
            if i % config.num_mem_blocks == block_id:
                gate_up_proj_adapter = nn.Sequential(
                    nn.Linear(self.config.hidden_size, self.config.adapter_rank, bias=False),
                    nn.Linear(self.config.adapter_rank, 2 * self.intermediate_size, bias=False),
                )
            else:
                gate_up_proj_adapter = nn.Identity()
            self.gate_up_proj_adapter_list.append(gate_up_proj_adapter)

        layer_block_map = config.hybrid_layer_ids
        self.layer_dic = {value: index for index, value in enumerate(layer_block_map)}

    def forward(self, hidden_state, layer_idx=None):
        gate_up_state = self.gate_up_proj(hidden_state)
        layer_idx = self.layer_dic[layer_idx]
        gate_up_state = gate_up_state + self.gate_up_proj_adapter_list[layer_idx](hidden_state)

        gate_up_state = torch.chunk(gate_up_state, 2, dim=-1)
        hidden_state = self.act_fn(gate_up_state[0]) * gate_up_state[1]
        output = self.down_proj(hidden_state)
        return output


class Zamba2AttentionDecoderLayer(nn.Module):
    def __init__(self, config: Zamba2Config, block_id: int = None, layer_idx: Optional[int] = None):
        super().__init__()
        self.block_id = block_id
        num_gs = len(config.hybrid_layer_ids)
        self.self_attn = Zamba2Attention(config, layer_idx=-1, num_fwd_mem_blocks=num_gs, block_id=block_id)
        self.feed_forward = Zamba2MLP(config, num_fwd_mem_blocks=num_gs, block_id=block_id)
        self.input_layernorm = Zamba2RMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
        self.pre_ff_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Zamba2HybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://arxiv.org/pdf/2405.16712).
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            position_embeddings=position_embeddings,
            **kwargs,
        )

        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states, layer_idx)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
   S
\	\   S\	\   S\	\R                     S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Zamba2MambaDecoderLayeri  ro   r   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        X l        g )N)ro   r   r  )	r0   r1   rj  mambara   r9   r  r  r   )r8   ro   r   r;   s      r<   r1    Zamba2MambaDecoderLayer.__init__   s:    %VI
,V-?-?VEXEXY"r>   rN   r  r  r  r?  rD  	use_cacher   transformer_hidden_statesr   c                     UnU
b  X-   OUnU R                  U5      nU R                  UUUS9nSnX-   nU4nU(       a  X4-  nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
N)rN   r  r  )r  r  )r8   rN   r  r   r  r  r?  rD  r  r   r  r  residualr  r   s                  r<   rV   Zamba2MambaDecoderLayer.forward&  s    < !
 :S9^M5dq 	 ,,];

'') # 
 ! !0 "++G((Gr>   )r  r   r  )	NNNNNFFNN)rY   rZ   r[   r\   r#   rx   r1   r3   r   r   rm   rb  r   r	   r   rV   r]   r^   r_   s   @r<   r  r    s   #| # # :>#'15.2=A,1$)59<@:||: !) 6: C=	:
 !.: ell+: !!9:: $D>: D>: !!1!12: $,ELL#9: 
class Zamba2HybridLayer(nn.Module):
    def __init__(
        self, shared_transformer: Zamba2AttentionDecoderLayer, linear: nn.Linear, mamba: Zamba2MambaDecoderLayer
    ):
        super().__init__()
        self.linear = linear
        self.mamba_decoder = mamba
        self.shared_transformer = shared_transformer

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Zamba2HybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        position_embeddings: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
                hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
        """
        layer_outputs = self.shared_transformer(
            hidden_states,
            original_hidden_states=original_hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            position_embeddings=position_embeddings,
        )

        transformer_hidden_states = layer_outputs[0]

        if output_attentions:
            self_attn_weights = layer_outputs[1]

        transformer_hidden_states = self.linear(transformer_hidden_states)

        layer_outputs = self.mamba_decoder(
            hidden_states,
            transformer_hidden_states=transformer_hidden_states,
            attention_mask=attention_mask,
            causal_mask=causal_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            position_embeddings=position_embeddings,
        )

        if output_attentions:
            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]

        return layer_outputs
class Zamba2PreTrainedModel(PreTrainedModel):
    config_class = Zamba2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Zamba2AttentionDecoderLayer", "Zamba2MambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_flex_attn = True
    _supports_sdpa = True
    _supports_cache_class = True
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (Zamba2RMSNorm, Zamba2RMSNormGated)):
            module.weight.data.fill_(1.0)
        elif isinstance(module, Zamba2MambaMixer):
            # Sample dt log-uniformly between time_step_min and time_step_max, then store its
            # inverse softplus so that softplus(dt_bias) recovers the sampled timestep.
            dt = torch.exp(
                torch.rand(self.config.n_mamba_heads)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            module.dt_bias.data.copy_(inv_dt)
            A = torch.arange(1, module.num_heads + 1)
            module.A_log.data.copy_(torch.log(A))
            module.D.data.fill_(1.0)


@auto_docstring
class Zamba2Model(Zamba2PreTrainedModel):
    """
    Model consisting of *config.num_hidden_layers* layers.

    Args:
        config: Zamba2Config
    """

    def __init__(self, config: Zamba2Config):
        super().__init__(config)
        self.config = config
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        blocks = [Zamba2AttentionDecoderLayer(config, block_id=k) for k in range(config.num_mem_blocks)]
        mamba_layers = []
        linear_layers = []
        self.layers_block_type = config.layers_block_type
        for i in range(config.num_hidden_layers):
            if config.layers_block_type[i] == "mamba":
                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
            elif config.layers_block_type[i] == "hybrid":
                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
                mamba_layers.append(Zamba2MambaDecoderLayer(config, layer_idx=i))
        mamba_layers = iter(mamba_layers)
        linear_layers = iter(linear_layers)
        blocks = cycle(blocks)
        layers = self.get_layers(blocks, linear_layers, mamba_layers)
        self.layers = nn.ModuleList(layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = Zamba2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        if config.use_mem_rope:
            if config.use_long_context:
                logger.warning_once(
                    "`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`."
                )
            self.rotary_emb = Zamba2RotaryEmbedding(config)
        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        hidden_states = inputs_embeds
        # word embeddings are kept around and re-concatenated in every shared attention block
        original_hidden_states = torch.clone(inputs_embeds)

        if use_cache and past_key_values is None:
            batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
            past_key_values = Zamba2HybridDynamicCache(self.config, batch_size, dtype=self.dtype, device=self.device)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length(layer_idx=self.first_transformer_layer_id)
                if past_key_values is not None
                else 0
            )
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
        if self.config.use_mem_rope:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)
        else:
            position_embeddings = None

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    original_hidden_states,
                    layer_idx,
                    attention_mask,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    position_embeddings,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    original_hidden_states=original_hidden_states,
                    layer_idx=layer_idx,
                    attention_mask=attention_mask,
                    causal_mask=causal_mask,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    position_embeddings=position_embeddings,
                )
            hidden_states = layer_outputs[0]

            if output_attentions and layer_outputs[1] is not None:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

        return output if return_dict else output.to_tuple()

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        target_length = cache_position[-1] + 1

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows, for example the relevant first rows when
            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient
            # attention path.
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    def get_layers(self, blocks, linear_layers, mamba_layers):
        layers = []
        self._tied_weights_keys = []
        self.first_transformer_layer_id = 0
        for layer_id, layer_type in enumerate(self.layers_block_type):
            if layer_type == "hybrid":
                if self.first_transformer_layer_id == 0:
                    self.first_transformer_layer_id = layer_id
                block = next(blocks)
                if self.config.num_mem_blocks * len(self.config.hybrid_layer_ids) > 1:
                    prefix_pattern = rf"^layers\.{layer_id}\.shared_transformer\."
                    main_keys_pattern = re.compile(
                        prefix_pattern
                        + r"(?:"
                        + r"self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|"
                        + r"feed_forward\.(?:gate_up_proj|down_proj)\.weight|"
                        + r"(?:input_layernorm|pre_ff_layernorm)\.weight"
                        + r")$"
                    )
                    self._tied_weights_keys.append(main_keys_pattern)
                    adapter_id = 0
                    for _layer_type in self.layers_block_type:
                        if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
                            adapter_pattern = re.compile(
                                r"^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\."
                                + str(adapter_id)
                                + r"\.(?:0|1)\.weight$"
                            )
                            self._tied_weights_keys.append(adapter_pattern)
                        adapter_id += 1
                    if self.config.use_shared_attention_adapter:
                        adapter_id = 0
                        for _layer_type in self.layers_block_type:
                            if _layer_type == "hybrid" and adapter_id % self.config.num_mem_blocks == block.block_id:
                                attn_adapter_pattern = re.compile(
                                    r"^shared_transformer\.self_attn\."
                                    + r"(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\."
                                    + str(adapter_id)
                                    + r"\.(?:0|1)\.weight$"
                                )
                                self._tied_weights_keys.append(attn_adapter_pattern)
                            adapter_id += 1
                layers.append(Zamba2HybridLayer(block, next(linear_layers), next(mamba_layers)))
            else:
                layers.append(next(mamba_layers))
        return layers
class Zamba2ForCausalLM(Zamba2PreTrainedModel, GenerationMixin):
    def __init__(self, config: Zamba2Config):
        super().__init__(config)
        self.model = Zamba2Model(config)
        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Zamba2HybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **loss_kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Zamba2ForCausalLM

        >>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- this model uses a unique cache type, `Zamba2HybridDynamicCache`

        empty_past_kv = past_key_values is None

        # Omit tokens covered by past_key_values
        if not empty_past_kv:
            # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
            # Exception 1: when passing input_embeds, input_ids may be missing entries
            # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
            if inputs_embeds is not None:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no-op, is Exception 2)
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = Zamba2HybridDynamicCache(
                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "num_logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs
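# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original module: `logits_to_keep`
# restricts the lm_head projection to the trailing positions, which is how
# generation avoids materializing logits for the whole prompt. Default
# `Zamba2Config` assumed.
#
#     model = Zamba2ForCausalLM(Zamba2Config())
#     input_ids = torch.randint(0, model.config.vocab_size, (1, 8))
#     out = model(input_ids, logits_to_keep=1)  # project only the final position
#     assert out.logits.shape[1] == 1
# ---------------------------------------------------------------------------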
@auto_docstring(
    custom_intro="""
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    """
)
class Zamba2ForSequenceClassification(Zamba2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Zamba2Model(config)
        self._tied_weights_keys = self.model._tied_weights_keys
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["Zamba2ForCausalLM", "Zamba2ForSequenceClassification", "Zamba2Model", "Zamba2PreTrainedModel"]
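# ---------------------------------------------------------------------------
# Illustrative numerical check, not part of the original module: the
# `inv_dt = dt + log(-expm1(-dt))` line in `Zamba2PreTrainedModel._init_weights`
# is the inverse of softplus, so the stored `dt_bias` reproduces the sampled
# timestep once the mixer applies softplus to it:
# dt + log(1 - exp(-dt)) = log(exp(dt) - 1) = softplus^{-1}(dt).
#
#     dt = torch.rand(8).clamp(min=1e-4)
#     inv_dt = dt + torch.log(-torch.expm1(-dt))
#     assert torch.allclose(nn.functional.softplus(inv_dt), dt, atol=1e-6)
# ---------------------------------------------------------------------------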