"""PyTorch MAMBA model."""

import math
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...cache_utils import MambaCache
from ...generation import GenerationMixin
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available, is_mambapy_available
from .configuration_mamba import MambaConfig


logger = logging.get_logger(__name__)

if is_mambapy_available():
    from mambapy.pscan import pscan
else:
    pscan = None

if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)
class MambaMixer(nn.Module):
    """
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    """

    def __init__(self, config: MambaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.ssm_state_size = config.state_size
        self.conv_kernel_size = config.conv_kernel
        self.intermediate_size = config.intermediate_size
        self.time_step_rank = int(config.time_step_rank)
        self.layer_idx = layer_idx
        self.use_conv_bias = config.use_conv_bias
        self.conv1d = nn.Conv1d(
            in_channels=self.intermediate_size,
            out_channels=self.intermediate_size,
            bias=config.use_conv_bias,
            kernel_size=config.conv_kernel,
            groups=self.intermediate_size,
            padding=config.conv_kernel - 1,
        )

        self.activation = config.hidden_act
        self.act = ACT2FN[config.hidden_act]

        self.use_mambapy = config.use_mambapy

        # projection of the input hidden states
        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=config.use_bias)
        # selective projection used to make dt, B and C input dependent
        self.x_proj = nn.Linear(self.intermediate_size, self.time_step_rank + self.ssm_state_size * 2, bias=False)
        # time step projection (discretization)
        self.dt_proj = nn.Linear(self.time_step_rank, self.intermediate_size, bias=True)

        # S4D real initialization. These are not discretized!
        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
        A = A.expand(self.intermediate_size, -1).contiguous()

        self.A_log = nn.Parameter(torch.log(A))
        self.D = nn.Parameter(torch.ones(self.intermediate_size))
        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
        self.use_bias = config.use_bias

        if not is_fast_path_available:
            if self.use_mambapy:
                if is_mambapy_available():
                    logger.warning_once(
                        "The fast path is not available because one of `(selective_state_update, selective_scan_fn, "
                        "causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py "
                        "backend. To install follow https://github.com/state-spaces/mamba/#installation and "
                        "https://github.com/Dao-AILab/causal-conv1d"
                    )
                else:
                    raise ImportError(
                        "use_mambapy is set to True but the mambapy package is not installed. To install it follow "
                        "https://github.com/alxndrTL/mamba.py."
                    )
            else:
                logger.warning_once(
                    "The fast path is not available because one of `(selective_state_update, selective_scan_fn, "
                    "causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential "
                    "implementation of Mamba, as use_mambapy is set to False. To install follow "
                    "https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. "
                    "For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py."
                )
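
    # For reference, both forward paths below compute the same selective scan
    # (a sketch of the math, not executable model code): with
    # dt = softplus(dt_proj(time_step)) and A = -exp(A_log), every channel keeps
    # a state h of size `ssm_state_size` that is updated once per position t as
    #     h_t = exp(dt_t * A) * h_{t-1} + dt_t * B_t * x_t    # zero-order-hold discretization
    #     y_t = C_t @ h_t + D * x_t                           # input-dependent readout
    # and y is gated with act(gate) before the final `out_proj`.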
	 $zMambaMixer.__init__Nhidden_statescache_paramscache_positionattention_maskc                 C   s  |  |dd}| jrI|d u rIt|| jj| jr| jjnd | jj| j	j| j
j| jr/| j
j nd t| j  d d | j | j	j dd}|S |jddd\}}|d ur]||d }| jj| jjd| jjd}|d ur|d dkrt|d|j| j || jj| j}|d}n&|d urtj|| j|jd  df}	|| j|	| t ||| jj| jd}|d ur||d }| |dd}
tj!|
| j"| j#| j#gdd\}}}| j	j|dd }t| j  }t$| j	d	r| j	j nd }|d ur2|d dkr2t%|j&| j |d
 |d
 ||d d df |d d df | j|d
 |dd
d}n,t'||||dd|dd| j ||ddd
\}}|d ur^|d ur^|(| j| | 
|dd}|S )Nr   r$   T)Z
delta_biasdelta_softplusdimr   r(   )r4   r!   ).r   )Zdt_softplus)rT   Zreturn_last_state))r9   	transposetrainingr   r3   weightr2   r!   r:   r;   rF   r8   floatr<   exprC   rE   chunk	unsqueezeviewsizer   squeezeconv_statesr    r4   r   
functionalpadr.   shapeupdate_conv_stater   splitr1   r,   hasattrr   
ssm_statesr   Zupdate_ssm_state)rJ   rP   rQ   rR   rS   projected_statescontextualized_statesgateZconv_weightsra   ssm_parameters	time_stepBCdiscrete_time_steprK   Ztime_proj_biasscan_outputs	ssm_staterN   rN   rO   cuda_kernels_forward   s   
X$




zMambaMixer.cuda_kernels_forwardc              	   C   s  |j \}}}|j}| |dd}	|	jddd\}
}|d ur&|
|d }
|d ur|j| j  }|	|
j
}|j d | jkrftj|
| j|
j d  df}|| j|| | | |
dd |f }
nU|| j|
|}|	| jjj
}tj|| jjd d dd d f  dd}
| jr|
| jj7 }
| |
	|d}
ntj|| j| jf|
j
|d}| | |
dd |f }
|d ur|
|d }
| |
dd}tj|| j| j| jgdd\}}}| |}tj|dd}t| j !  }t|d d d d d d f |d d d d d d d f  }|d d d d d d d f |d d d d d d d f !  }||
d d d d d d d f !  }| j"r| j#r|d u rt$|dd|dd}||d %ddd}||
| j&d d d d f   }|| | }nug }t'|D ]D}|d d d d |d d f | |d d d d |d d f  }t(|	||d d |d d f d}|)|d d d d df  qtj*|dd}||
| j&d d d d f   }|| | }|d ur|j| j +| | ,|dd}|S )	Nr   r$   rU   r   r(   .devicer'   r
   )-rd   r'   r9   rW   r\   r]   rh   r    clonetoru   r.   r   rb   rc   re   r5   r3   rY   r<   sumr2   r!   Zzerosr/   r,   r:   rf   r1   r;   Zsoftplusr[   rC   rZ   r6   rX   r   r`   rE   rangematmulappendstackcopy_rF   )rJ   Zinput_statesrQ   rR   rS   Z
batch_sizeZseq_len_r'   ri   rP   rk   rr   Z
conv_staterl   rm   rn   ro   rp   rK   Z
discrete_AZ
discrete_BZdeltaB_uhsZscan_outputrq   irj   rN   rN   rO   slow_forward   sp   (
:<$<* 
zMambaMixer.slow_forwardc                 C   s>   t rd| jjjjv rtj s| ||||S | 	||||S )Ncuda)
rG   r:   rY   ru   typer<   Z_dynamoZis_compilingrs   r   )rJ   rP   rQ   rR   rS   rN   rN   rO   forward:  s   zMambaMixer.forwardr   )__name__
__module____qualname____doc__r   r0   r*   r<   Tensorr   r   
LongTensorrs   r   r   __classcell__rN   rN   rL   rO   r   >   s4    ?
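
# A minimal numerical sanity check for the two paths above (a sketch, assuming the
# optional `mamba-ssm` / `causal-conv1d` CUDA kernels are installed; not run by the
# library itself):
#
#     config = MambaConfig(hidden_size=64, state_size=8, num_hidden_layers=1)
#     mixer = MambaMixer(config, layer_idx=0).cuda().eval()
#     x = torch.randn(2, 16, config.hidden_size, device="cuda")
#     torch.testing.assert_close(
#         mixer.slow_forward(x), mixer.cuda_kernels_forward(x), rtol=1e-2, atol=1e-2
#     )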
(fUr   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	MambaRMSNormư>c                    s&   t    tt|| _|| _dS )zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)r)   r*   r   rA   r<   rD   rY   variance_epsilon)rJ   r+   epsrL   rN   rO   r*   G  s   

zMambaRMSNorm.__init__c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr$   r(   T)Zkeepdim)	r'   rw   r<   r>   powmeanZrsqrtr   rY   )rJ   rP   Zinput_dtypeZvariancerN   rN   rO   r   O  s
   zMambaRMSNorm.forwardc                 C   s   | j jd  d| j S )Nr   z, eps=)rY   rd   r   rJ   rN   rN   rO   
extra_reprV  s   zMambaRMSNorm.extra_repr)r   )r   r   r   r*   r   r   r   rN   rN   rL   rO   r   F  s    r   c                       sJ   e Zd Z fddZ			d	dee deej deej fddZ  Z	S )
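
# Note: unlike nn.LayerNorm, the RMS norm above subtracts no mean and has no bias;
# it rescales by 1/sqrt(mean(x**2) + eps) in float32 and casts back to the input dtype.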


class MambaBlock(nn.Module):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.residual_in_fp32 = config.residual_in_fp32
        self.norm = MambaRMSNorm(config.hidden_size, eps=config.layer_norm_epsilon)
        self.mixer = MambaMixer(config, layer_idx=layer_idx)

    def forward(
        self,
        hidden_states,
        cache_params: Optional[MambaCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        residual = hidden_states
        hidden_states = self.norm(hidden_states.to(dtype=self.norm.weight.dtype))
        if self.residual_in_fp32:
            residual = residual.to(torch.float32)

        hidden_states = self.mixer(
            hidden_states, cache_params=cache_params, cache_position=cache_position, attention_mask=attention_mask
        )
        hidden_states = residual + hidden_states
        return hidden_states


@auto_docstring
class MambaPreTrainedModel(PreTrainedModel):
    config_class = MambaConfig
    base_model_prefix = "backbone"
    _no_split_modules = ["MambaBlock", "MambaMixer"]
    supports_gradient_checkpointing = True
    _is_stateful = True

    def _init_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, MambaMixer):
            module.A_log._no_weight_decay = True
            module.D._no_weight_decay = True

            dt_init_std = self.config.time_step_rank**-0.5 * self.config.time_step_scale
            if self.config.time_step_init_scheme == "constant":
                nn.init.constant_(module.dt_proj.weight, dt_init_std)
            elif self.config.time_step_init_scheme == "random":
                nn.init.uniform_(module.dt_proj.weight, -dt_init_std, dt_init_std)

            dt = torch.exp(
                torch.rand(self.config.intermediate_size)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            with torch.no_grad():
                module.dt_proj.bias.copy_(inv_dt)
            module.dt_proj.bias._no_reinit = True

        if isinstance(module, nn.Linear):
            if module.bias is not None:
                if not getattr(module.bias, "_no_reinit", False):
                    nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, std=self.config.initializer_range)

        if self.config.rescale_prenorm_residual:
            # Reinitialize selected weights subject to the OpenAI GPT-2 paper scheme: scale the
            # residual-path projection by 1/sqrt(N), where N is the number of residual layers.
            for name, p in module.named_parameters():
                if name in ["out_proj.weight"]:
                    nn.init.kaiming_uniform_(p, a=math.sqrt(5))
                    with torch.no_grad():
                        p /= math.sqrt(self.config.num_hidden_layers)


@dataclass
class MambaOutput(ModelOutput):
    """
    Class for the MAMBA model outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nlast_hidden_staterQ   rP   )r   r   r   r   r   r   r<   FloatTensor__annotations__rQ   r   rP   r   rN   rN   rN   rO   r     s
   
 r   c                   @   s\   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dS )MambaCausalLMOutputa  
    Base class for causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        cache_params (`MambaCache`):
            The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
            avoid providing the old `input_ids`.

            Includes both the State space model state matrices after the selective scan, and the Convolutional states
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    NlosslogitsrQ   rP   )r   r   r   r   r   r   r<   r   r   r   rQ   r   rP   r   rN   rN   rN   rO   r     s   
 r   c                       s   e Zd Z fddZdd Zdd Zdd Ze																dd
ee	j
 dee	j
 dee dee dee dee dee	j
 dee	j
 deeef fddZ  ZS )
MambaModelc                    sn   t    t j j| _t fddt j	D | _
d| _t j jd| _| | j |   d S )Nc                    s   g | ]}t  |d qS )r   )r   ).0idxr   rN   rO   
<listcomp>  s    z'MambaModel.__init__.<locals>.<listcomp>Fr   )r)   r*   r   r   
vocab_sizer+   
embeddingsZ
ModuleListry   r   layersgradient_checkpointingr   r   norm_fZ"_register_load_state_dict_pre_hook	load_hook	post_initrJ   r   rL   r   rO   r*     s    zMambaModel.__init__c                 G   s2   |D ]}d|v r| |||dd<  d S qd S )Nz
embedding.zembeddings.)popreplace)rJ   Z
state_dictprefixargskrN   rN   rO   r     s   zMambaModel.load_hookc                 C      | j S Nr   r   rN   rN   rO   get_input_embeddings     zMambaModel.get_input_embeddingsc                 C   
   || _ d S r   r   rJ   Znew_embeddingsrN   rN   rO   set_input_embeddings     
zMambaModel.set_input_embeddingsN	input_idsinputs_embedsrQ   	use_cacheoutput_hidden_statesreturn_dictrR   rS   returnc	                 C   s  |dur|n| j j}|dur|n| js| j jnd}|dur|n| j j}|du |duA r/td|du r8| |}| jrB| jrB|rBd}|rk|du rbt| j |	d|j
|jd}tjd| j j|j
d}n|du rjtdnd}|}	|rsdnd}
| jD ]"}| jr| jr| |j|	|||}	n||	|||d	}	|r|
|	f }
qx| |	}	|r|
|	f }
|std
d |	||
fD S t|	|r||
dS d|
dS )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   rt   ru   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrN   r   c                 s   s    | ]	}|d ur|V  qd S r   rN   )r   vrN   rN   rO   	<genexpr>I  s    z%MambaModel.forward.<locals>.<genexpr>)r   rQ   rP   )r   r   rX   r   use_return_dict
ValueErrorr   r   r   r_   ru   r'   r<   r=   r-   r   Z_gradient_checkpointing_func__call__r   tupler   )rJ   r   r   rQ   r   r   r   rR   rS   rP   Zall_hidden_statesZmixer_blockrN   rN   rO   r     sf   





zMambaModel.forward)NNNNNNNN)r   r   r   r*   r   r   r   r   r   r<   r   r   boolr   r   r   r   r   rN   rN   rL   rO   r     sB    	

r   z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )Zcustom_introc                       s"  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Z	d!de	de
eef dede
eef fddZ					d"dee deej deej fddZe									d#deej deej deej dee deej dee dee dee deej deeef fdd Z  ZS )$MambaForCausalLMzlm_head.weightc                    s8   t  | t|| _tj|j|jdd| _| 	  d S )NFr%   )
r)   r*   r   r   r   r7   r+   r   lm_headr   r   rL   rN   rO   r*   [  s   
zMambaForCausalLM.__init__c                 C   r   r   r   r   rN   rN   rO   get_output_embeddingsb  r   z&MambaForCausalLM.get_output_embeddingsc                 C   r   r   r   r   rN   rN   rO   set_output_embeddingse  r   z&MambaForCausalLM.set_output_embeddingsc                 C   s
   | j  S r   )r   r   r   rN   rN   rO   r   h  r   z%MambaForCausalLM.get_input_embeddingsc                 C   s   | j |S r   )r   r   r   rN   rN   rO   r   k  s   z%MambaForCausalLM.set_input_embeddingsr   outputsmodel_kwargsnum_new_tokensr   c                 K   s   | dd |d< | ddr$d|v r$|d d ur$|d dd  | |d< d|v r?|d }tj|||jd dfgdd	|d< |S )
NrQ   r   TrR   r(   rS   r   r   rU   )getr<   catZnew_onesrd   )rJ   r   r   r   kwargsrS   rN   rN   rO   #_update_model_kwargs_for_generationn  s   

z4MambaForCausalLM._update_model_kwargs_for_generationNrQ   rR   rS   c           	      K   s   |r-|d u r
t d|d dkr"|d d df d}|d ur!d }ntjd| jj|jd}|d ur:|d u r:d|i}nd| i}|||||d |S )Nz`cache_position` should not be None as it should have been initialized in `model.generate`, you are responsible for passing in a valid `cache_position` if you are calling `prepare_inputs_for_generation` directly with `use_cache=True`r   r(   r   r   r   )rQ   r   rR   rS   )	r   r]   r<   r=   r   r-   ru   r@   update)	rJ   r   r   r   rQ   rR   rS   r   Zmodel_inputsrN   rN   rO   prepare_inputs_for_generation  s,   
z.MambaForCausalLM.prepare_inputs_for_generationr   r   labelsr   r   r   c
              
   K   s   |dur|n| j j}| j|||||||	|d}|d }| || jjj }d}|dur]||j}|dddddf 	 }|dddf 	 }t
 }||d|d|d}|ss|f|dd  }|durq|f| S |S t|||j|jdS )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)rQ   r   r   r   r   rR   rS   r   .r(   r   )r   r   rQ   rP   )r   r   r   r   rw   rY   r'   rZ   ru   r@   r	   r^   r_   r   rQ   rP   )rJ   r   rS   r   rQ   r   r   r   r   rR   r   Zmamba_outputsrP   r   r   Zshift_logitsZshift_labelsZloss_fctoutputrN   rN   rO   r     s:   
zMambaForCausalLM.forward)r   )NNNNN)	NNNNNNNNN)r   r   r   Z_tied_weights_keysr*   r   r   r   r   r   r   strr   r0   r   r   r   r<   r   r   r   r   r   r   r   r   r   r   r   rN   rN   rL   rO   r   R  sx    



0	

r   )r   r   r   ):r   r   dataclassesr   typingr   r   r   r   r   r<   Ztorch.utils.checkpointr   Ztorch.nnr	   Zactivationsr   Zcache_utilsr   Z
generationr   Zmodeling_utilsr   utilsr   r   r   Zutils.import_utilsr   r   r   Zconfiguration_mambar   Z
get_loggerr   rH   Zmambapy.pscanr   Z&mamba_ssm.ops.selective_scan_interfacer   r   Z+mamba_ssm.ops.triton.selective_state_updater   Zcausal_conv1dr   r   allrG   Moduler   r   r   r   r   r   r   r   __all__rN   rN   rN   rO   <module>   s`   

  
7n 
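
# Example usage (a sketch; `state-spaces/mamba-130m-hf` is one of the public Mamba
# checkpoints on the Hub):
#
#     from transformers import AutoTokenizer, MambaForCausalLM
#
#     tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
#     model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
#     inputs = tokenizer("Hey how are you doing?", return_tensors="pt")
#     out = model.generate(**inputs, max_new_tokens=10)
#     print(tokenizer.batch_decode(out))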