
"""PyTorch MAMBA model."""

import math
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import MambaCache
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available
from .configuration_mamba import MambaConfig


logger = logging.get_logger(__name__)

if is_mambapy_available():
    from mambapy.pscan import pscan
else:
    pscan = None

if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)


class MambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
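
    In the notation used by `slow_forward` below, the discretized recurrence applied at every time
    step t (independently per channel and per state dimension) is:

        h_t = exp(∆_t * A) * h_{t-1} + (∆_t * B_t) * x_t
        y_t = C_t · h_t + D * x_t

    and the scan output is finally gated by `act(gate)` before the output projection.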
    """

    def __init__(self, config: MambaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.intermediate_size
        self.time_step_rank = int(config.time_step_rank)
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.intermediate_size,
            padding=config.conv_kernel - 1,
        )

        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]

        self.use_mambapy = config.use_mambapy

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
        # selective projection used to make dt, B and C input dependent
        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
        # time step projection (discretization)
        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()

        self.A_log = nn.Parameter(torch.log(A))
        self.D = nn.Parameter(torch.ones(self.intermediate_size))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

        if not is_fast_path_available:
            if self.use_mambapy:
                if is_mambapy_available():
                    logger.warning_once(
                        "The fast path is not available because one of `(selective_state_update, selective_scan_fn, "
                        "causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py "
                        "backend. To install follow https://github.com/state-spaces/mamba/#installation and "
                        "https://github.com/Dao-AILab/causal-conv1d"
                    )
                else:
                    raise ImportError(
                        "use_mambapy is set to True but the mambapy package is not installed. To install it follow "
                        "https://github.com/alxndrTL/mamba.py."
                    )
            else:
                logger.warning_once(
                    "The fast path is not available because one of `(selective_state_update, selective_scan_fn, "
                    "causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential "
                    "implementation of Mamba, as use_mambapy is set to False. To install follow "
                    "https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. "
                    "For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py."
                )

    def cuda_kernels_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(hidden_states).transpose(1, 2)

        if self.training and cache_params is None:  # Doesn't support outputting the states -> used for training
            contextualized_states = mamba_inner_fn(
                projected_states,
                self.conv1d.weight,
                self.conv1d.bias if self.use_conv_bias else None,
                self.x_proj.weight,
                self.dt_proj.weight,
                self.out_proj.weight,
                self.out_proj.bias.float() if self.use_bias else None,
                -torch.exp(self.A_log.float()),
                None,  # input-dependent B
                None,  # input-dependent C
                self.D.float(),
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
            )

        else:
            hidden_states, gate = projected_states.chunk(2, dim=1)

            if attention_mask is not None:
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

            # 2. Convolution sequence transformation
            conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
            if cache_params is not None and cache_position[0] > 0:
                hidden_states = causal_conv1d_update(
                    hidden_states.squeeze(-1),
                    cache_params.conv_states[self.layer_idx],
                    conv_weights,
                    self.conv1d.bias,
                    self.activation,
                )
                hidden_states = hidden_states.unsqueeze(-1)
            else:
                if cache_params is not None:
                    conv_states = nn.functional.pad(
                        hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0)
                    )
                    cache_params.update_conv_state(self.layer_idx, conv_states, cache_position)
                hidden_states = causal_conv1d_fn(
                    hidden_states, conv_weights, self.conv1d.bias, activation=self.activation
                )

            if attention_mask is not None:
                hidden_states = hidden_states * attention_mask.unsqueeze(1)

            # 3. State Space Model sequence transformation
            # 3.a. input varying initialization of time_step, B and C
            ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
            time_step, B, C = torch.split(
                ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
            )
            discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)

            A = -torch.exp(self.A_log.float())
            # 3.c perform the recurrence y ← SSM(A, B, C)(x)
            time_proj_bias = self.dt_proj.bias.float() if hasattr(self.dt_proj, "bias") else None
            if cache_params is not None and cache_position[0] > 0:
                scan_outputs = selective_state_update(
                    cache_params.ssm_states[self.layer_idx],
                    hidden_states[..., 0],
                    discrete_time_step[..., 0],
                    A,
                    B[:, 0],
                    C[:, 0],
                    self.D,
                    gate[..., 0],
                    time_proj_bias,
                    dt_softplus=True,
                ).unsqueeze(-1)
            else:
                scan_outputs, ssm_state = selective_scan_fn(
                    hidden_states,
                    discrete_time_step,
                    A,
                    B.transpose(1, 2),
                    C.transpose(1, 2),
                    self.D.float(),
                    gate,
                    time_proj_bias,
                    delta_softplus=True,
                    return_last_state=True,
                )
                if ssm_state is not None and cache_params is not None:
                    cache_params.update_ssm_state(self.layer_idx, ssm_state)

            # 4. Final linear projection
            contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
        return contextualized_states

    def slow_forward(
        self,
        input_states,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        batch_size, seq_len, _ = input_states.shape
        dtype = input_states.dtype

        # 1. Gated MLP's linear projection
        projected_states = self.in_proj(input_states).transpose(1, 2)  # [batch, 2 * intermediate_size, seq_len]
        hidden_states, gate = projected_states.chunk(2, dim=1)

        if attention_mask is not None:
            hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 2. Convolution sequence transformation
        if cache_params is not None:
            ssm_state = cache_params.ssm_states[self.layer_idx].clone()
            ssm_state = ssm_state.to(hidden_states.device)
            if cache_position.shape[0] == self.conv_kernel_size:
                # prefill stage: run the full convolution and cache the padded input window
                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))

                cache_params.update_conv_state(self.layer_idx, conv_state, cache_position)
                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])  # [batch, intermediate_size, seq_len]
            else:
                # decoding stage: update the rolling conv state and apply the kernel as a dot product
                conv_state = cache_params.update_conv_state(self.layer_idx, hidden_states, cache_position)
                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
                if self.use_conv_bias:
                    hidden_states += self.conv1d.bias
                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)  # [batch, intermediate_size, 1]
        else:
            ssm_state = torch.zeros(
                (batch_size, self.intermediate_size, self.ssm_state_size), device=hidden_states.device, dtype=dtype
            )
            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])  # [batch, intermediate_size, seq_len]

        if attention_mask is not None:
            hidden_states = hidden_states * attention_mask.unsqueeze(1)

        # 3. State Space Model sequence transformation
        # 3.a. Selection: [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
        ssm_parameters = self.x_proj(hidden_states.transpose(1, 2))
        time_step, B, C = torch.split(
            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
        )
        discrete_time_step = self.dt_proj(time_step)  # [batch, seq_len, intermediate_size]
        discrete_time_step = nn.functional.softplus(discrete_time_step).transpose(1, 2)  # [batch, intermediate_size, seq_len]

        # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
        A = -torch.exp(self.A_log.float())  # [intermediate_size, ssm_state_size]
        discrete_A = torch.exp(A[None, :, None, :] * discrete_time_step[:, :, :, None])  # [batch, intermediate_size, seq_len, ssm_state_size]
        discrete_B = discrete_time_step[:, :, :, None] * B[:, None, :, :].float()  # [batch, intermediate_size, seq_len, ssm_state_size]
        deltaB_u = discrete_B * hidden_states[:, :, :, None].float()

        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
        if self.use_mambapy and self.training and cache_params is None:
            hs = pscan(discrete_A.transpose(1, 2), deltaB_u.transpose(1, 2))  # [batch, seq_len, intermediate_size, ssm_state_size]

            scan_output = (hs @ C.unsqueeze(-1)).squeeze(3).transpose(1, 2)  # [batch, intermediate_size, seq_len]
            scan_output = scan_output + hidden_states * self.D[None, :, None]
            scan_output = scan_output * self.act(gate)
        else:
            scan_outputs = []
            for i in range(seq_len):
                ssm_state = discrete_A[:, :, i, :] * ssm_state + deltaB_u[:, :, i, :]  # [batch, intermediate_size, ssm_state]
                scan_output = torch.matmul(ssm_state.to(dtype), C[:, i, :].unsqueeze(-1))  # [batch, intermediate_size, 1]
                scan_outputs.append(scan_output[:, :, 0])
            scan_output = torch.stack(scan_outputs, dim=-1)  # [batch, intermediate_size, seq_len]
            scan_output = scan_output + (hidden_states * self.D[None, :, None])
            scan_output = scan_output * self.act(gate)

            if cache_params is not None:
                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)

        # 4. Final linear projection
        contextualized_states = self.out_proj(scan_output.transpose(1, 2))  # [batch, seq_len, hidden_size]
        return contextualized_states

    def forward(
        self,
        hidden_states,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        if is_fast_path_available and "cuda" in self.x_proj.weight.device.type and not torch._dynamo.is_compiling():
            return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask)
        return self.slow_forward(hidden_states, cache_params, cache_position, attention_mask)
 !!1!12	^ 	^rZ   r    c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )MambaRMSNormiF  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z<
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{self.weight.shape[0]}, eps={self.variance_epsilon}"


class MambaBlock(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.residual_in_fp32 = config.residual_in_fp32
        self.norm = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.mixer = MambaMixer(config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        residual = hidden_states
        hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
        if self.residual_in_fp32:
            residual = residual.to(torch.float32)

        hidden_states = self.mixer(
            hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask
        )
        hidden_states = residual + hidden_states
        return hidden_states


@auto_docstring
class MambaPreTrainedModel(PreTrainedModel):
    config_class = MambaConfig
    base_model_prefix = "backbone"
    _no_split_modules = ["MambaBlock", "MambaMixer"]
    supports_gradient_checkpointing = True
    _is_stateful = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, MambaMixer):
            module.A_log._no_weight_decay = True
            module.D._no_weight_decay = True

            dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
            if self.config.time_step_init_scheme == "constant":
                nn.init.constant_(module.dt_proj.weight, dt_init_std)
            elif self.config.time_step_init_scheme == "random":
                nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)

            # sample the time step log-uniformly in [time_step_min, time_step_max] and store its
            # inverse softplus in the bias, so that softplus(dt_proj.bias) recovers the sampled dt
            dt = torch.exp(
                torch.rand(self.config.intermediate_size)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            with torch.no_grad():
                module.dt_proj.bias.copy_(inv_dt)
            module.dt_proj.bias._no_reinit = True

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                if not getattr(module.bias, "_no_reinit", False):
                    nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=self.config.initializer_range)

        if self.config.rescale_prenorm_residual:
            # Reinitialize selected weights following the GPT-2 scheme: scale residual-path weights
            # at initialization by 1/sqrt(N) where N is the number of residual layers.
            for name, p in module.named_parameters():
                if name in ["out_proj.weight"]:
                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                    with torch.no_grad():
                        p /= math.sqrt(self.config.num_hidden_layers)


@dataclass
class MambaOutput(ModelOutput):
    """
    Class for the MAMBA model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
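
    Example (editor's sketch): reusing `cache_params` across calls avoids re-running the prefix. The
    checkpoint name below is illustrative, not something this module requires.

    ```python
    >>> import torch
    >>> from transformers import AutoTokenizer, MambaModel

    >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
    >>> model = MambaModel.from_pretrained("state-spaces/mamba-130m-hf")

    >>> inputs = tokenizer("Mamba is a state space model", return_tensors="pt")
    >>> outputs = model(**inputs, use_cache=True)

    >>> next_token = torch.tensor([[42]])  # any next token id; reuse the cache instead of the full prefix
    >>> cache_position = torch.tensor([inputs.input_ids.shape[1]])
    >>> outputs = model(next_token, cache_params=outputs.cache_params, cache_position=cache_position, use_cache=True)
    ```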
Nlast_hidden_stater\   r[   r   )r   r   r   r   r   r  r   rF   FloatTensor__annotations__r\   r   r[   r   r   r   rZ   rX   r  r    sH    $ 6:x 1 129)-L(:&-8<M8E%"3"345<rZ   r  c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Srg)	MambaCausalLMOutputi  a  
Base class for causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
Nlosslogitsr\   r[   r   )r   r   r   r   r   r  r   rF   r  r  r  r\   r   r[   r   r   r   rZ   rX   r
  r
    s\    ( )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<rZ   r
  c                     ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\   S\	\
R                     S\	\
R                     S\\\4   4S jj5       rSrU =r$ )
MambaModeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )Nr   Fr   )r0   r1   r	   r   
vocab_sizer2   
embeddings
ModuleListr   r   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rU   r"   idxrW   s      rX   r1   MambaModel.__init__  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$rRq3Z%FRq$rs&+#"6#5#56;T;TU//? %ss   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)rU   
state_dictprefixargsks        rX   r  MambaModel.load_hook  s4    Aq EO^^TUEV
99\=AB rZ   c                     U R                   $ Nr  r   s    rX   get_input_embeddingsMambaModel.get_input_embeddings  s    rZ   c                     Xl         g r$  r%  rU   new_embeddingss     rX   set_input_embeddingsMambaModel.set_input_embeddings  s    (rZ   	input_idsinputs_embedsr\   	use_cacheoutput_hidden_statesreturn_dictr]   r^   returnc	                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Ucn  [        U R                   UR                  S5      UR                  UR                  S9n[        R                  " SU R                   R                  UR                  S9nOUc  [        S5      eOSnUn	U(       a  SOSn
U R                   HZ  nU R                  (       a/  U R                  (       a  U R!                  UR"                  XXx5      n	O	U" U	UUUS	9n	U(       d  MU  X4-   n
M\     U R%                  U	5      n	U(       a  X4-   n
U(       d  ['        S
 XU
4 5       5      $ [)        U	U(       a  UU
S9$ SU
S9$ )ay  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
NFz:You must specify exactly one of input_ids or inputs_embedsr   r   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyr   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr$  r   ).0vs     rX   	<genexpr>%MambaModel.forward.<locals>.<genexpr>I  s     f$Tq$Ts   	)r  r\   r[   )r"   r0  rg   r/  use_return_dict
ValueErrorr  r  r   rn   r   r.   rF   rG   r5   r  _gradient_checkpointing_func__call__r  tupler  )rU   r-  r.  r\   r/  r0  r1  r]   r^   r[   all_hidden_statesmixer_blocks               rX   r   MambaModel.forward  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !; 	 (  L%"6BD;;K**t}} $ A A((-~! !,!!-#1#1	! $#$58H$H! '  M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
rZ   )r  r  r  r  )NNNNNNNN)r   r   r   r   r1   r  r&  r+  r   r   rF   r   r   boolr   r   r  r   r   r   r   s   @rX   r  r    s    
)  1548-1$(/3&*5959Q
E,,-Q
   0 01Q
 z*	Q

 D>Q
 'tnQ
 d^Q
 !!1!12Q
 !!1!12Q
 
uk!	"Q
 Q
rZ   r  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc                     ^  \ rS rSrS/rU 4S jrS rS rS rS r	 SS\
S	\\\4   S
\S\\\4   4S jjr     SS\\   S\\R&                     S\\R&                     4S jjr\         SS\\R&                     S\\R&                     S\\R,                     S\\   S\\R&                     S\\   S\\   S\\   S\\R0                     S\\\4   4S jj5       rSrU =r$ )MambaForCausalLMiR  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr,   )
r0   r1   r  r   r	   rA   r2   r  lm_headr  )rU   r"   rW   s     rX   r1   MambaForCausalLM.__init__[  sF     "6*yy!3!3V5F5FUSrZ   c                     U R                   $ r$  rG  r   s    rX   get_output_embeddings&MambaForCausalLM.get_output_embeddingsb  s    ||rZ   c                     Xl         g r$  rJ  r)  s     rX   set_output_embeddings&MambaForCausalLM.set_output_embeddingse  s    %rZ   c                 6    U R                   R                  5       $ r$  )r   r&  r   s    rX   r&  %MambaForCausalLM.get_input_embeddingsh  s    }}1133rZ   c                 8    U R                   R                  U5      $ r$  )r   r+  r)  s     rX   r+  %MambaForCausalLM.set_input_embeddingsk  s    }}11.AArZ   outputsmodel_kwargsnum_new_tokensr2  c                    UR                  SS 5      US'   UR                  SS5      (       a  SU;   a  US   b  US   SS  U-   US'   SU;   a<  US   n[        R                  " XUR                  UR                  S   S45      /SS	9US'   U$ )
Nr\   r/  Tr]   r/   r^   r   r   rb   )getrF   catnew_onesrs   )rU   rT  rU  rV  kwargsr^   s         rX   #_update_model_kwargs_for_generation4MambaForCausalLM._update_model_kwargs_for_generationn  s     (/{{>4'H^$[$// L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* rZ   r\   r]   r^   c                 L   U(       ai  Uc  [        S5      eUS   S:  a  US S 2S4   R                  S5      nUb  S nO4[        R                  " SU R                  R
                  UR                  S9nUb  Uc  SU0nOSUR                  5       0nUR                  UUUUS.5        U$ )Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r/   r4  r.  r-  )r\   r/  r]   r^   )	r;  rl   rF   rG   r"   r5   r   rJ   update)	rU   r-  r.  r/  r\   r]   r^   r[  model_inputss	            rX   prepare_inputs_for_generation.MambaForCausalLM.prepare_inputs_for_generation  s     % e 
 a 1$%ae,66r:	!-%)N "'a1H1HQZQaQa!b$)=+];L')=)=)?@L ,&"0"0		
 rZ   r-  r.  labelsr0  r1  r/  c
                    Ub  UOU R                   R                  nU R                  UUUUUUU	US9nUS   nU R                  UR	                  U R                  R
                  R                  5      5      R                  5       nSnUb  UR	                  UR                  5      nUSSS2SS24   R                  5       nUSSS24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
N)r\   r.  r0  r1  r/  r]   r^   r   .r/   r   )r  r  r\   r[   )r"   r:  r   rG  r   rh   r.   ri   r   rJ   r
   rm   rn   r
  r\   r[   )rU   r-  r^   r.  r\   rc  r0  r1  r/  r]   r[  mamba_outputsr[   r  r  shift_logitsshift_labelsloss_fctoutputs                      rX   r   MambaForCausalLM.forward  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
rZ   )r   rG  )r   )NNNNN)	NNNNNNNNN)r   r   r   r   _tied_weights_keysr1   rK  rN  r&  r+  r   r   strr   r8   r\  r   r   rF   r   ra  r   r  rB  r   r   r   r
  r   r   r   r   s   @rX   rE  rE  R  s    ++&4B YZ"26sCx.RU	c3h, -15959.
 z*. !!1!12. !!1!12.`  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
rZ   rE  )rE  r  r   ):r   r   dataclassesr   typingr   r   r   r   r   rF   torch.utils.checkpointr	   torch.nnr