
from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from ...cache_utils import Cache
from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast
from ...utils import auto_docstring, can_return_tuple, logging
from ..bamba.configuration_bamba import BambaConfig
from ..bamba.modeling_bamba import BambaMixer, BambaRMSNormGated, HybridMambaAttentionDynamicCache
from ..granitemoeshared.modeling_granitemoeshared import (
    GraniteMoeSharedAttention,
    GraniteMoeSharedDecoderLayer,
    GraniteMoeSharedForCausalLM,
    GraniteMoeSharedMLP,
    GraniteMoeSharedModel,
    GraniteMoeSharedPreTrainedModel,
)
from .configuration_granitemoehybrid import GraniteMoeHybridConfig


logger = logging.get_logger(__name__)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridAttention(   config	layer_idxc                 $   > [         TU ]  X5        g Nsuper__init__selfr   r    	__class__s      u/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr%   "GraniteMoeHybridAttention.__init__)   s    +     	__name__
__module____qualname____firstlineno__r   intr%   __static_attributes____classcell__r(   s   @r)   r   r   (   s    ,5 ,# , ,r+   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridMambaLayer-   r   r    c                 8   > [         TU ]  [        U5      U5        g r"   )r$   r%   r   r&   s      r)   r%   #GraniteMoeHybridMambaLayer.__init__.   s    V,i8r+   r,   r-   r5   s   @r)   r7   r7   -   s    95 9# 9 9r+   r7   c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )GraniteMoeHybridRMSNormGated2   c                 $   > [         TU ]  X5        g r"   r#   )r'   hidden_sizeepsr(   s      r)   r%   %GraniteMoeHybridRMSNormGated.__init__3   s    *r+   r,   )gư>)r.   r/   r0   r1   r%   r3   r4   r5   s   @r)   r<   r<   2   s    + +r+   r<   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GraniteMoeHybridMLP7   r   c                 $   > [         TU ]  U5        g r"   r#   r'   r   r(   s     r)   r%   GraniteMoeHybridMLP.__init__8   s     r+   r,   )r.   r/   r0   r1   r   r%   r3   r4   r5   s   @r)   rC   rC   7   s    !5 ! !r+   rC   c                   b  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\
   S\	\   S	\	\   S
\	\R                     S\	\   S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )GraniteMoeHybridDecoderLayer<   r   r    c                    > [         TU ]  X5        [        U5      U l        S U l        S U l        UR                  U   S:X  a  [        X5      U l        O[        X5      U l        UR                  U   U l	        g )Nmamba)
r$   r%   rC   
shared_mlp	self_attnrL   layers_block_typer7   r   
layer_typer&   s      r)   r%   %GraniteMoeHybridDecoderLayer.__init__=   sg    +-f5
##I.'93FFDJ6vIDN 229=r+   hidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingsreturnc	                    Un
U R                  U5      nU R                  b  U R                  UUUUS9nSnOU R                  " SUUUUUUUS.U	D6u  pnXU R                  -  -   nUn
U R	                  U5      nU R                  U5      u  pXR                  U5      -   nXU R                  -  -   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )aY  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
                should not be returned during inference.
            position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
                into the model.
        """
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        if self.mamba is not None:
            hidden_states = self.mamba(
                hidden_states=hidden_states,
                cache_position=cache_position,
                cache_params=past_key_value,
                attention_mask=attention_mask,
            )
            # Mamba layers do not produce attention weights
            self_attn_weights = None
        else:
            hidden_states, self_attn_weights = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
        hidden_states = residual + hidden_states * self.residual_multiplier

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)
        hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if use_cache:
            outputs += (past_key_value,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class GraniteMoeHybridPreTrainedModel(GraniteMoeSharedPreTrainedModel):
    config_class = GraniteMoeHybridConfig
    _no_split_modules = ["GraniteMoeHybridDecoderLayer"]
    _is_stateful = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, nn.Conv1d):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, GraniteMoeHybridMambaLayer):
            module.dt_bias.data.fill_(1.0)
            module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1))
            module.D.data.fill_(1.0)
        elif isinstance(module, GraniteMoeHybridRMSNormGated):
            module.weight.data.fill_(1.0)


class GraniteMoeHybridModel(GraniteMoeSharedModel):
    def __init__(self, config: GraniteMoeHybridConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeHybridDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        output_router_logits: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple, MoeModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        inputs_embeds = inputs_embeds * self.embedding_multiplier

        if use_cache and past_key_values is None:
            logger.warning_once(
                "GraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. "
                "Because one was not provided, no cache will be returned."
            )

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(
            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
        )
        mamba_mask = self._update_mamba_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds

        position_embeddings = None
        if self.rotary_emb is not None:
            position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_router_logits = () if output_router_logits else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            # Mamba layers consume the 2D mask, attention layers the 4D causal mask
            layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask

            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                output_router_logits=output_router_logits,
                position_embeddings=position_embeddings,
            )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions and layer_outputs[1] is not None:
                all_self_attns += (layer_outputs[1],)

            if output_router_logits and layer_outputs[-1] is not None:
                all_router_logits += (layer_outputs[-1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            router_logits=all_router_logits,
        )

    def _update_mamba_mask(self, attention_mask, cache_position):
        """
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        mamba_mask = attention_mask
        if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)):
            mamba_mask = None
        return mamba_mask


class GraniteMoeHybridForCausalLM(GraniteMoeSharedForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeHybridConfig):
        super().__init__(config)
        self.model = GraniteMoeHybridModel(config)
        # Initialize weights and apply final processing
        self.post_init()

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten: this model uses `HybridMambaAttentionDynamicCache`, which also
        # carries the Mamba conv/SSM states, instead of the default cache
        empty_past_kv = past_key_values is None

        # If a cache exists, keep only the tokens that have not been processed yet;
        # otherwise create the hybrid cache for the first generation step
        if not empty_past_kv:
            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = HybridMambaAttentionDynamicCache(
                self.config, input_ids.shape[0], self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, only use them in the first generation step
        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
            }
        )
        return model_inputs

    def _supports_default_dynamic_cache(self) -> bool:
        """
        Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
        and does not need to initialize the Cache in advance in order to save memory
        (because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
        for `HybridMambaAttentionDynamicCache`).
        """
        return False


__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"]
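

# Minimal usage sketch (added; kept as comments so importing this module stays
# side-effect free). The checkpoint id is an assumption -- substitute any released
# GraniteMoeHybrid checkpoint. The classes above are normally reached through the
# Auto* APIs:
#
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   repo = "ibm-granite/granite-4.0-tiny-preview"  # placeholder/assumed checkpoint id
#   tokenizer = AutoTokenizer.from_pretrained(repo)
#   model = AutoModelForCausalLM.from_pretrained(repo)
#   inputs = tokenizer("Hybrid Mamba/attention models interleave", return_tensors="pt")
#   output_ids = model.generate(**inputs, max_new_tokens=20)
#   print(tokenizer.decode(output_ids[0], skip_special_tokens=True))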