
"""PyTorch Zamba model."""

import math
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig


if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)

logger = logging.get_logger(__name__)


class ZambaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        ZambaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


ALL_LAYERNORM_LAYERS.append(ZambaRMSNorm)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class ZambaHybridDynamicCache(DynamicCache):
    """
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors.
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    """
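    # Usage sketch (illustrative, not executed at import): unlike a plain `DynamicCache`, this
    # cache must be created up front with a fixed batch size. `prepare_inputs_for_generation`
    # below does this implicitly, but it can also be built by hand:
    #
    #     cache = ZambaHybridDynamicCache(model.config, batch_size=1, dtype=model.dtype, device=model.device)
    #     out = model(input_ids, past_key_values=cache, use_cache=True)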
    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
        self.dtype = dtype
        self.layers_block_type = config.layers_block_type
        self.has_previous_state = False
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.n_mamba_heads = config.n_mamba_heads
        self.conv_states = []
        self.ssm_states = []
        self.transformer_layers = []
        self._modules = {}
        self._parameters = {}
        self._buffers = {}
        for i in range(config.num_hidden_layers):
            self.conv_states += [
                torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype)
            ]
            cache_shape = (
                batch_size,
                self.n_mamba_heads,
                self.intermediate_size // self.n_mamba_heads,
                self.ssm_state_size,
            )
            self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)]
            if self.layers_block_type[i] == "hybrid":
                self.transformer_layers.append(i)
        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        # Update the cache
        if self.key_cache[layer_idx].shape[-1] == 0:
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            device = self.key_cache[layer_idx].device
            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.value_cache[layer_idx].device
            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
            device = self.conv_states[layer_idx].device
            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
            device = self.ssm_states[layer_idx].device
            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # take any layer that contains cache and not empty tensor
        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
        if len(self.key_cache) <= layer_idx:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
        raise NotImplementedError("ZambaHybridDynamicCache does not have a legacy cache equivalent.")

    @classmethod
    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
        raise NotImplementedError("ZambaHybridDynamicCache does not have a legacy cache equivalent.")


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
class ZambaAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://arxiv.org/pdf/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    """
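    # Dimension check (illustrative, with hypothetical numbers): for hidden_size = 3712 and
    # 16 heads, attention_hidden_size = 2 * 3712 = 7424 and head_dim = 7424 // 16 = 464, so the
    # softmax scale below is (464 / 2) ** -0.5 rather than the usual 464 ** -0.5, compensating
    # for the doubled input width.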
    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_hidden_size = config.attention_hidden_size
        self.head_dim = config.attention_head_dim
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.scaling = (self.head_dim / 2) ** -0.5
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
class ZambaMambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    """
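    # Reference recurrence (illustrative): per head and channel, the scans below materialize the
    # discretized state-space update
    #
    #     h[t] = exp(dt[t] * A) * h[t-1] + (dt[t] * B[t]) * x[t]
    #     y[t] = C[t] . h[t] + D * x[t]
    #
    # A minimal single-channel sketch of the same loop, with hypothetical scalar inputs
    # xs, dts, Bs, Cs and parameters A, D:
    #
    #     h, ys = 0.0, []
    #     for x, dt, b, c in zip(xs, dts, Bs, Cs):
    #         h = math.exp(dt * A) * h + dt * b * x
    #         ys.append(c * h + D * x)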
    def __init__(self, config: ZambaConfig, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.mamba_d_state
        self.conv_kernel_size = config.mamba_d_conv
        self.intermediate_size = config.mamba_expand * config.hidden_size
        self.time_step_rank = config.mamba_dt_rank
        self.n_mamba_heads = config.n_mamba_heads
        self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads
        self.use_conv_bias = config.mamba_conv_bias
        self.use_bias = config.mamba_proj_bias
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=self.use_conv_bias,
            kernel_size=self.conv_kernel_size,
            groups=self.intermediate_size,
            padding=self.conv_kernel_size - 1,
        )

        self.activation = config.hidden_mamba_act
        self.act = ACT2FN[config.hidden_mamba_act]

        self.use_fast_kernels = config.use_mamba_kernels

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
        # weight associated to the selective projection used to make dt, B and C input dependent
        # each mamba head is processed independently
        self.x_proj_weight = nn.Parameter(
            torch.zeros(
                self.n_mamba_heads,
                self.time_step_rank + self.ssm_state_size * 2,
                self.mamba_head_dim,
            )
        )
        # time step projection (discretization)
        self.dt_proj_weight = nn.Parameter(
            (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5)
            * 2
            / self.time_step_rank**0.5
        )
        self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim))

        # S4D real initialization. These are not discretized!
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()
        self.A_log = nn.Parameter(torch.log(A).reshape(self.n_mamba_heads, self.mamba_head_dim, -1))
        self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)

        if not is_fast_path_available:
            logger.warning_once(
                "The fast path is not available because one of `(selective_state_update, selective_scan_fn,"
                " causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow"
                " https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d."
                " If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
            )

    def cuda_kernels_forward(
        self, hidden_states: torch.Tensor, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None
    ):
        batch_size, seq_len, _ = hidden_states.shape
        use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1

        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(hidden_states).transpose(1, 2)

        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2).contiguous()
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)

        # 2. Convolution sequence transformation
        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
        if use_precomputed_states:
            hidden_states = causal_conv1d_update(
                hidden_states.squeeze(-1),
                cache_params.conv_states[self.layer_idx],
                conv_weights,
                self.conv1d.bias,
                self.activation,
            )
            hidden_states = hidden_states.unsqueeze(-1)
        else:
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            if cache_params is not None:
                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx].copy_(conv_states)
            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. SSM sequence transformation
        # 3.a. input varying initialization of time_step, B and C
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)

        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )

        discrete_time_step = self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)

        A = -torch.exp(self.A_log.float())

        # 3.c perform the recurrence y <- SSM(A, B, C)(x)
        time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None
        scan_outputs = torch.empty((batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype)
        if use_precomputed_states:
            for n in range(self.n_mamba_heads):
                scan_outputs_ = selective_state_update(
                    cache_params.ssm_states[self.layer_idx][:, n],
                    hidden_states[n, ..., 0],
                    discrete_time_step[n, ..., 0],
                    A[n],
                    B[n, :, 0],
                    C[n, :, 0],
                    self.D[n],
                    gate[n, ..., 0],
                    time_proj_bias[n],
                    dt_softplus=True,
                ).unsqueeze(-1)
                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1)
        else:
            ssm_state = torch.empty(
                (batch_size, 0, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
            for n in range(self.n_mamba_heads):
                scan_outputs_, ssm_state_ = selective_scan_fn(
                    hidden_states[n],
                    discrete_time_step[n],
                    A[n],
                    B[n].transpose(1, 2),
                    C[n].transpose(1, 2),
                    self.D[n].float(),
                    gate[n],
                    time_proj_bias[n],
                    delta_softplus=True,
                    return_last_state=True,
                )
                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1).contiguous()
                ssm_state = torch.cat((ssm_state, ssm_state_.unsqueeze(1)), dim=1)
            if ssm_state is not None and cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
        return contextualized_states

    def slow_forward(self, input_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype
        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(input_states).transpose(1, 2)
        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
        hidden_states = hidden_states.squeeze(2).contiguous()
        gate = gate.squeeze(2)
        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)

        use_cache = isinstance(cache_params, ZambaHybridDynamicCache)
        # 2. Convolution sequence transformation
        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
            if self.training:
                # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
                ssm_state = cache_params.ssm_states[self.layer_idx].clone()
            else:
                ssm_state = cache_params.ssm_states[self.layer_idx]

            ssm_state = ssm_state.to(hidden_states.device)

            if (
                cache_params.has_previous_state
                and seq_len == 1
                and cache_params.conv_states[self.layer_idx].shape[0] == batch_size
            ):
                conv_state = cache_params.conv_states[self.layer_idx]  # [batch, intermediate_size, conv_kernel_size]
                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
                conv_state[:, :, -1] = hidden_states[:, :, 0]
                cache_params.conv_states[self.layer_idx] = conv_state
                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                # [batch, intermediate_size, 1] : decoding
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)
            else:
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
                cache_params.conv_states[self.layer_idx] = conv_state
                # [batch, intermediate_size, seq_len]
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
                if attention_mask is not None and not torch.all(attention_mask == 1):
                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
        else:
            ssm_state = torch.zeros(
                (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size),
                device=hidden_states.device,
                dtype=dtype,
            )
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
            if attention_mask is not None and not torch.all(attention_mask == 1):
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. Selection
        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)

        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )
        discrete_time_step = (
            self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)
        ) + self.dt_proj_bias[:, None, :, None]
        discrete_time_step = nn.functional.softplus(discrete_time_step)

        # 3.b. Discretization
        A = -torch.exp(self.A_log.float())
        discrete_A = torch.exp(A[:, None, :, None, :] * discrete_time_step[:, :, :, :, None])
        discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].float()
        deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float()
        # 3.c perform the recurrence y <- SSM(A, B, C)(x)
        scan_outputs = []
        for i in range(seq_len):
            ssm_state = discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1)
            scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, i, :].unsqueeze(-1))
            scan_outputs.append(scan_output[:, :, :, 0])
        scan_output = torch.stack(scan_outputs, dim=-1)
        scan_output = scan_output + (hidden_states * self.D[:, None, :, None])
        scan_output = scan_output * self.act(gate)

        if use_cache:
            cache_params.ssm_states[self.layer_idx] = ssm_state

        # 4. Final linear projection
        contextualized_states = self.out_proj(
            scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2)
        )
        return contextualized_states

    def forward(self, hidden_states, cache_params: Optional[ZambaHybridDynamicCache] = None, attention_mask=None):
        if self.use_fast_kernels:
            if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type:
                raise ValueError(
                    "Fast Mamba kernels are not available. Make sure they are installed and that the mamba module"
                    " is on a CUDA device. Please run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm',"
                    " or set use_mamba_kernels=False in the model's config"
                )
            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask)
        return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask)
class ZambaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
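# Gating sketch (illustrative): ZambaMLP is the standard gated MLP,
# down_proj(act(gate_proj(x)) * up_proj(x)); e.g. with a hypothetical config.hidden_act = "gelu",
# a vector x maps to W_d @ (gelu(W_g @ x) * (W_u @ x)).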
class ZambaAttentionDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.self_attn = ZambaAttention(config, layer_idx)
        self.feed_forward = ZambaMLP(config)
        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://arxiv.org/pdf/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )

        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs
class ZambaMambaDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx)
        self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        transformer_hidden_states: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # `transformer_hidden_states` is the output from shared transformer + linear layer
        # (see fig. 2 in https://arxiv.org/pdf/2405.16712).
        hidden_states = (
            hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states
        )
        hidden_states = self.input_layernorm(hidden_states)

        hidden_states = self.mamba(
            hidden_states=hidden_states,
            cache_params=past_key_value,
            attention_mask=attention_mask,
        )

        self_attn_weights = None

        # residual connection after mamba
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (past_key_value,)

        return outputs
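# Data-flow note (illustrative): at a hybrid position, `transformer_hidden_states` carries the
# linearly projected output of the shared attention block; it is added to the mamba input above,
# before `input_layernorm`, following eq. (6) of https://arxiv.org/pdf/2405.16712.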
class ZambaHybridLayer(nn.Module):
    def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer):
        super().__init__()
        self.shared_transf = shared_transf
        self.linear = linear
        self.mamba_decoder = mamba

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
                hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        layer_outputs = self.shared_transf(
            hidden_states,
            original_hidden_states=original_hidden_states,
            layer_idx=layer_idx,
            attention_mask=causal_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        transformer_hidden_states = layer_outputs[0]

        if output_attentions:
            self_attn_weights = layer_outputs[1]

        transformer_hidden_states = self.linear(transformer_hidden_states)

        layer_outputs = self.mamba_decoder(
            hidden_states,
            transformer_hidden_states=transformer_hidden_states,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        if output_attentions:
            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]

        return layer_outputs
@auto_docstring
class ZambaPreTrainedModel(PreTrainedModel):
    config_class = ZambaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = False
    _supports_cache_class = True  # Note: only supports ZambaHybridDynamicCache
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ZambaRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, ZambaMambaMixer):
            module.x_proj_weight.data.normal_(mean=0.0, std=std)
            dt_init_std = self.config.mamba_dt_rank**-0.5
            nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std)

            mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads
            dt = torch.exp(
                torch.rand(self.config.n_mamba_heads, mamba_head_dim)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            module.dt_proj_bias.data.copy_(inv_dt)

            A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
            A = A.expand(module.intermediate_size, -1).contiguous()
            module.A_log.data.copy_(torch.log(A).reshape(module.n_mamba_heads, module.mamba_head_dim, -1))
            module.D.data.fill_(1.0)

    @classmethod
    def _check_and_enable_flash_attn_2(
        cls,
        config,
        torch_dtype: Optional[torch.dtype] = None,
        device_map: Optional[Union[str, Dict[str, int]]] = None,
        hard_check_only: bool = False,
        check_device_map: bool = False,
    ):
        """
        Overloads `PreTrainedModel._check_and_enable_flash_attn_2` so as to DISABLE Flash Attention 2 by default on Zamba models.
        Flash attention 2 is currently not supported in the HuggingFace implementation of Zamba v1.
        """
        config = super()._check_and_enable_flash_attn_2(
            config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map
        )

        # if using the default path -> swap flash attention by eager
        if not hard_check_only and config._attn_implementation == "flash_attention_2":
            config._attn_implementation = "eager"

        return config
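# Initialization note (illustrative): `_init_weights` samples dt log-uniformly in
# [time_step_min, time_step_max] and stores inv_dt = dt + log(1 - exp(-dt)), the inverse of
# softplus, in `dt_proj_bias`; the forward pass then recovers the sampled step size because
# softplus(inv_dt) = log(1 + exp(dt) - 1) = dt.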
@auto_docstring
class ZambaModel(ZambaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    """

    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)

        block = ZambaAttentionDecoderLayer(config)
        mamba_layers = []
        linear_layers = []
        self.layers_block_type = config.layers_block_type
        for i in range(config.num_hidden_layers):
            if config.layers_block_type[i] == "mamba":
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
            elif config.layers_block_type[i] == "hybrid":
                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
        mamba_layers = iter(mamba_layers)
        linear_layers = iter(linear_layers)
        layers = []
        self._tied_weights_keys = []
        for layer_id, layer_type in enumerate(self.layers_block_type):
            if layer_type == "hybrid":
                prefix_name = f"layers.{layer_id}."
                tied_keys = [
                    "shared_transf.self_attn.q_proj.weight",
                    "shared_transf.self_attn.k_proj.weight",
                    "shared_transf.self_attn.v_proj.weight",
                    "shared_transf.self_attn.o_proj.weight",
                    "shared_transf.feed_forward.gate_proj.weight",
                    "shared_transf.feed_forward.up_proj.weight",
                    "shared_transf.feed_forward.down_proj.weight",
                    "shared_transf.input_layernorm.weight",
                    "shared_transf.pre_ff_layernorm.weight",
                ]
                self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]]
                layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers)))
            else:
                layers.append(next(mamba_layers))
        self.layers = nn.ModuleList(layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        original_hidden_states = torch.clone(inputs_embeds)
        # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer

        if use_cache and past_key_values is None:
            logger.warning_once(
                "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no "
                "cache will be returned."
            )

        if cache_position is None:
            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    original_hidden_states,
                    layer_idx,
                    attention_mask,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    original_hidden_states=original_hidden_states,
                    layer_idx=layer_idx,
                    attention_mask=attention_mask,
                    causal_mask=causal_mask,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]

            if output_attentions:
                if layer_outputs[1] is not None:
                    all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

        return output if return_dict else output.to_tuple()

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        target_length = cache_position[-1] + 1

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the causal_mask; this is required by
            # F.scaled_dot_product_attention's memory-efficient attention path.
            # Details: https://github.com/pytorch/pytorch/issues/110213
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask
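# Layer-pattern sketch (illustrative, hypothetical config): with
# layers_block_type = ["mamba", "mamba", "hybrid", "mamba", "mamba", "hybrid"],
# `ZambaModel.__init__` builds four plain `ZambaMambaDecoderLayer`s and two `ZambaHybridLayer`s
# that wrap the *same* `ZambaAttentionDecoderLayer` instance, each paired with its own
# (untied) `nn.Linear` projection.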
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, ZambaForCausalLM

>>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed (e.g. the last position during generation)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        empty_past_kv = past_key_values is None

        # Omit tokens already covered by past_key_values: if we have a cache, slice `input_ids`
        # through `cache_position` to keep only the unprocessed tokens.
        if not empty_past_kv:
            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = ZambaHybridDynamicCache(
                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # Create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # If `inputs_embeds` are passed, only use them in the first generation step
        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "num_logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs
E,,-O
 !.O
 u//0	O

 ""9:O
   1 12O
 ))*O
 D>O
 $D>O
 'tnO
 d^O
 !!1!12O
 c5<</0O
 
u,,	-O
 O
h 9 9r9   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it needs to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token in each row that is not a padding
    token. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. The same
    fallback applies when `inputs_embeds` are passed instead of `input_ids`, since the model cannot guess the
    padding tokens in that case.
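
    A minimal sketch of this last-non-padding-token selection (illustrative only; the tensors and the
    `pad_token_id` value below are dummy assumptions, not part of the modeling code):

    ```python
    import torch

    pad_token_id = 0  # assumed padding id
    input_ids = torch.tensor([[5, 8, 9, 0, 0, 0], [7, 2, 4, 6, 1, 3]])  # (batch=2, seq_len=6)
    logits = torch.randn(2, 6, 3)  # per-token logits: (batch, seq_len, num_labels)

    # Rightmost non-padding position of each row -> tensor([2, 5])
    non_pad_mask = (input_ids != pad_token_id).int()
    last_non_pad_token = (torch.arange(input_ids.shape[-1]) * non_pad_mask).argmax(-1)

    # One vector of logits per example, taken at that position: (batch, num_labels)
    pooled_logits = logits[torch.arange(input_ids.shape[0]), last_non_pad_token]
    ```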
    """
)
class ZambaForSequenceClassification(ZambaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ZambaModel(config)
        self._tied_weights_keys = self.model._tied_weights_keys
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
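
Example (a usage sketch; the base checkpoint `Zyphra/Zamba-7B-v1` ships no classification head, so
`num_labels` and the resulting prediction below are illustrative):

```python
>>> import torch
>>> from transformers import AutoTokenizer, ZambaForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")
>>> model = ZambaForSequenceClassification.from_pretrained("Zyphra/Zamba-7B-v1", num_labels=2)

>>> inputs = tokenizer("The movie was great!", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = int(logits.argmax(dim=-1))
```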
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not a padding token
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)

        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"]