from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from ...cache_utils import Cache
from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast
from ...utils import auto_docstring, can_return_tuple, logging
from ..bamba.configuration_bamba import BambaConfig
from ..bamba.modeling_bamba import BambaMixer, BambaRMSNormGated, HybridMambaAttentionDynamicCache
from ..granitemoeshared.modeling_granitemoeshared import (
    GraniteMoeSharedAttention,
    GraniteMoeSharedDecoderLayer,
    GraniteMoeSharedForCausalLM,
    GraniteMoeSharedMLP,
    GraniteMoeSharedModel,
    GraniteMoeSharedPreTrainedModel,
)
from .configuration_granitemoehybrid import GraniteMoeHybridConfig


logger = logging.get_logger(__name__)


class GraniteMoeHybridAttention(GraniteMoeSharedAttention):
    def __init__(self, config: GraniteMoeHybridConfig, layer_idx: int):
        super().__init__(config, layer_idx)
__module____qualname__r   intr#   __classcell__r(   r(   r&   r)   r   (       r   c                       r   )GraniteMoeHybridMambaLayerr   r   c                    s   t  t|| d S r    )r"   r#   r   r$   r&   r(   r)   r#   .   s   z#GraniteMoeHybridMambaLayer.__init__r+   r(   r(   r&   r)   r2   -   r1   r2   c                       s   e Zd Zd fdd	Z  ZS )GraniteMoeHybridRMSNormGatedư>c                    r   r    r!   )r%   Zhidden_sizeZepsr&   r(   r)   r#   3   r*   z%GraniteMoeHybridRMSNormGated.__init__)r4   )r,   r-   r.   r#   r0   r(   r(   r&   r)   r3   2   s    r3   c                       s"   e Zd Zdef fddZ  ZS )GraniteMoeHybridMLPr   c                    s   t  | d S r    r!   r%   r   r&   r(   r)   r#   8   s   zGraniteMoeHybridMLP.__init__)r,   r-   r.   r   r#   r0   r(   r(   r&   r)   r5   7   s    r5   c                       s   e Zd Zdedef fddZ							ddejdeej d	ee	 d
ee
 dee
 deej dee
 deeejejf  deejeeejejf  f fddZ  ZS )GraniteMoeHybridDecoderLayerr   r   c                    s\   t  || t|| _d | _d | _|j| dkr t||| _nt||| _|j| | _	d S )Nmamba)
r"   r#   r5   
shared_mlp	self_attnr8   Zlayers_block_typer2   r   
layer_typer$   r&   r(   r)   r#   =   s   
z%GraniteMoeHybridDecoderLayer.__init__NFhidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsreturnc	              
   K   s   |}
|  |}| jdur| j||||d}d}n| jd|||||||d|	\}}}|
|| j  }|}
| |}| |\}}|| | }|
|| j  }|f}|rX||f7 }|r_||f7 }|rf||f7 }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model.
        N)r<   rA   Zcache_paramsr=   )r<   r=   r>   r?   r@   rA   rC   r(   )Zinput_layernormr8   r:   Zresidual_multiplierZpost_attention_layernormZblock_sparse_moer9   )r%   r<   r=   r>   r?   r@   rA   rB   rC   kwargsZresidualZself_attn_weights_Zmoe_hidden_statesrouter_logitsZoutputsr(   r(   r)   forwardJ   sF   %





z$GraniteMoeHybridDecoderLayer.forward)NNFFNFN)r,   r-   r.   r   r/   r#   torchTensorr   r   bool


class GraniteMoeHybridPreTrainedModel(GraniteMoeSharedPreTrainedModel):
    config_class = GraniteMoeHybridConfig
    _no_split_modules = ["GraniteMoeHybridDecoderLayer"]
    _is_stateful = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, nn.Conv1d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
            module.D.data.fill_(1.0)
        elif isinstance(module, GraniteMoeHybridRMSNormGated):
            module.weight.data.fill_(1.0)


class GraniteMoeHybridModel(GraniteMoeSharedModel):
    def __init__(self, config: GraniteMoeHybridConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = inputs_embeds * self.embedding_multiplier

        if use_cache and past_key_values is None:
            logger.warning_once(
                "GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. "
                "Because one was not provided, no cache will be returned."
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds

        # Rotary embeddings are shared across the attention layers; some hybrid checkpoints run without them.
        position_embeddings = None
        if self.rotary_emb is not None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            # Mamba layers consume the 2D mask, attention layers the 4D causal mask.
            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                output_router_logits=output_router_logits,
                position_embeddings=position_embeddings,
            )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
            if output_attentions and layer_outputs[1] is not None:
                all_self_attns += (layer_outputs[1],)
            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def _update_mamba_mask(self, attention_mask, cache_position):
        """
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        mamba_mask = attention_mask
        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
            mamba_mask = None
        return mamba_mask
dZ  Z	S )GraniteMoeHybridForCausalLMzlm_head.weightr   c                    s"   t  | t|| _|   d S r    )r"   r#   rX   modelZ	post_initr6   r&   r(   r)   r#   A  s   
z$GraniteMoeHybridForCausalLM.__init__NTc                 K   s  |d u }	|	s5|d us|d |j d kr"|d d |j d  d f }n!|j d |j d kr4|d d |f }nt| j|j d | j| jd}|d url|d u rl| dd }||dkd |	sl|d d |j d  d f }|d urw|	rwd|i}
nd| i}
|
	|||||d |
S )Nrf   r   r   rd   ra   r^   )r_   r`   r@   r=   rA   )
ri   r   r   Zdtypere   longZcumsumZmasked_fill_
contiguousupdate)r%   r^   r`   r=   ra   rA   r_   r@   rE   Zempty_past_kvZmodel_inputsr(   r(   r)   prepare_inputs_for_generationG  s8   
	z9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationrD   c                 C   s   dS )aG  
        Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
        and does not need to initialize the Cache in advance in order to save memory
        (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        """
        return False


__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"]