
    fThF                       S SK JrJrJrJrJr  S SKrS SKJs  J	r
  S SKJr  S SKJs  Js  Jr  S SKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJr  SSKJrJr  SSK J!r!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  \*" 5       (       a  S SK-J.r.  S SK/J0r0J1r1  OSr.\)" 5       (       a	  S SK2J3r3J4r4  OSu  r4r3\&" 5       (       a  S SK5J6r6  SSK7J8r8  \'Rr                  " \:5      r;S r<SKS jr=S\R|                  S\?S\R|                  4S jr@ SLS\R                  S\R|                  S\R|                  S \R|                  S!\\R|                     S"\BS#\B4S$ jjrC " S% S&\R                  5      rD " S' S(\R                  5      rES)\R|                  S*\?4S+ jrFS, rGS- rH\I" \.\3\445      rJS. rK " S/ S0\R                  5      rL " S1 S2\R                  R                  5      rM " S3 S4\R                  5      rN " S5 S6\R                  5      rO " S7 S8\R                  5      rP " S9 S:\R                  5      rQ " S; S<\R                  5      rR " S= S>\5      rS\$ " S? S@\"5      5       rT " SA SB\R                  5      rU\$ " SC SD\T5      5       rV   SMSE\\R|                  \\R|                     S4   SF\\?   S!\\R|                     S\\R|                  \?4   4SG jjrW " SH SI\T\5      rX/ SJQrYg)N    )CallableListOptionalTupleUnionN)nn)ACT2FN   )Cache)GenerationMixin)AttentionMaskConverter)GradientCheckpointingLayer)BaseModelOutputWithPastMoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging)is_causal_conv1d_availableis_mamba_2_ssm_available   )GraniteMoeHybridConfig)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combined)causal_conv1d_fncausal_conv1d_updateNN)	BlockMask)make_flex_block_causal_maskc                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..N   dim)shapetorchcat)xx1x2s      v/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modeling_granitemoehybrid.pyrotate_halfr2   @   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''    c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer2   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r1   apply_rotary_pos_embr>   G   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr3   hidden_statesn_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r+   expandreshape)r?   r@   batchnum_key_value_headsslenhead_dims         r1   	repeat_kvrI   b   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr3   modulequerykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr(   r
   r'   )r*   dtype)ptrainingr   )rI   num_key_value_groupsr,   matmul	transposer+   r   
functionalsoftmaxfloat32torS   rP   rU   
contiguous)rJ   rK   rL   rM   rN   rO   rP   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r1   eager_attention_forwardrd   n   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r3   c                   l  ^  \ rS rSrSrS\S\4U 4S jjr      SS\R                  S\
\R                     S\
\R                     S	\
\   S
\S\
\R                     S\
\\R                  \R                  4      S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )GraniteMoeHybridAttention   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 z  > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l	        UR                  U l        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l        SU l        UR                   U l        U R                  U R                  -  U R                  :w  a&  [%        SU R                   SU R                   S35      e[&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  U R                  -  UR*                  S9U l        [&        R(                  " U R                  U R                  UR*                  S9U l        g )NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).bias)super__init__rh   ri   loggerwarning_once	__class____name__attention_dropouthidden_sizenum_attention_heads	num_headsrH   rF   rV   	is_causalattention_multiplierrO   
ValueErrorr   Linearattention_biasq_projk_projv_projo_projselfrh   ri   rq   s      r1   rn   "GraniteMoeHybridAttention.__init__   s   " !8!8 9 :, , "(!9!9!--33((DNN:#)#=#= $(NNd6N6N$N!22MMDNN*t/?/??QRVRbRbQc$T^^$4B8 
 ii 0 0$..4==2PW]WlWlmii 0 0$2J2JT]]2Zagavavwii 0 0$2J2JT]]2Zagavavwii 0 0$2B2BI^I^_r3   r?   rN   r:   past_key_value	use_cachecache_positionposition_embeddingsrA   c                 *   UR                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUb  UOSu  nnUb  [        XUU5      u  pUb$  UXS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [         R#                  S	5        O[$        U R                  R                     nU" U UUUU4U R&                  (       d  S
OU R(                  U R*                  S.UD6u  nnUR	                  XS5      nU R-                  U5      nUUU4$ )Nr   r(   r#   )r9   r8   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rP   rO   r'   )sizer|   r}   r~   viewrv   rH   rX   rF   r>   updateri   rd   rh   _attn_implementationgetro   rp   r   rU   rs   rO   r   )r   r?   rN   r:   r   r   r   r   r^   bszq_len_query_statesr_   r`   r8   r9   cache_kwargsattention_interfacerc   ra   s                        r1   forward!GraniteMoeHybridAttention.forward   s    &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm*=*I&|S*';LVY[^'_$L%#&sUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7	%
  $}}C$2H2HLL	%
 	%
!\ "&&s26kk+.L.88r3   )rs   rh   rH   rt   rw   r}   ri   rv   rV   rF   r   r|   rO   r~   )NNNFNN)rr   
__module____qualname____firstlineno____doc__r   intrn   r,   Tensorr   
LongTensorr   boolr   r   __static_attributes____classcell__rq   s   @r1   rf   rf      s    G`5 `# `F 2637*.59KO69||69 !.69 u//0	69
 !69 69 !!1!1269 &eELL%,,,F&GH69 
u||Xell3XeELL>Q5RR	S69 69r3   rf   c                   P   ^  \ rS rSrSr\R                  S4S\4U 4S jjjrSr	U =r
$ ) HybridMambaAttentionDynamicCache   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
Nrh   c                 J  > [         T	U ]  XX45        UR                  U l        SU l        UR                  nUR
                  n/ U l        / U l        / U l        [        UR                  5       GH%  nU R                  U   S:X  a  U =R                  [        R                  " UUR                  UR                  -  SUR                  -  U-  -   UUUS9/-  sl        U =R                  [        R                  " UUR                   UR"                  UUUS9/-  sl        M  U =R                  [        R$                  " / /U-  US9/-  sl        U =R                  [        R$                  " / /U-  US9/-  sl        U R                  R'                  U5        GM(     [        UR                  5       Vs/ s H  n[        R$                  " / /U-  US9PM     snU l        [        UR                  5       Vs/ s H  n[        R$                  " / /U-  US9PM     snU l        g s  snf s  snf )NFmambar(   devicerS   r   )rm   rn   layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr,   zerosmamba_expandrt   mamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)
r   rh   
batch_sizerS   r   conv_kernel_sizessm_state_sizeir   rq   s
            r1   rn   )HybridMambaAttentionDynamicCache.__init__   s   U;!'!9!9"'!..--"$v//0A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11 14 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   -#H/#H )r   r   r   r   r   r   r   )rr   r   r   r   r   r,   float16r   rn   r   r   r   s   @r1   r   r      s+     JO_c %u5 %u %ur3   r   input_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)moderM   )lenr+   r,   r   rY   pad)r   r   	pad_shapes      r1   pad_tensor_by_sizer   !  sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr3   c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r
   r   r'   r(   )r   r   r+   rD   )r   r   
chunk_sizes      r1   reshape_into_chunksr   ,  s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r3   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r'   .Nr   diagonalr   rR   r)   )
r   rC   r,   trilonesr   r   masked_fillcumsuminf)r   r   masktensor_segsums       r1   segment_sumr   @  s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr3   c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr   r   )r+   rS   r\   )r?   rN   rS   s      r1   apply_mask_to_padding_statesr   W  s_     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr3   c                     ^  \ rS rSrSrS\S\4U 4S jjr    SS\R                  S\
\   S\
\R                     S	\
\R                     S
\
\R                     4
S jjr   SS\
\   S\
\R                     S	\
\R                     4S jjr    SS\
\   S\
\R                     S	\
\R                     S
\
\R                     4S jjrSrU =r$ )GraniteMoeHybridMambaLayeric  u'  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

The are a few differences between this and Mamba2Mixer:
- The variable use_precomputed_states is slightly different due to the HybridCache structure
- There's a few non-obvious bugs fixed with batching in the slow path that exist in main
- Some extra variables that our layer doesn't need have been removed
- We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
rh   ri   c           	        > [         TU ]  5         UR                  U l        UR                  U l        UR
                  U l        UR                  U l        [        UR                  U R                  -  5      U l        X l        UR                  U l        UR                  U l        ["        UR                     U l        UR&                  U l        UR*                  U l        UR.                  U l        UR2                  U l        UR6                  U l        S[;        S5      4U l        SU l        SU l         U R                  SU R0                  -  U R                  -  -   U l!        [D        RF                  " U RB                  U RB                  UR                  U R                  U RB                  U R                  S-
  S9U l$        U R                  U RB                  -   U R                  -   n[D        RJ                  " U R                  UU R(                  S9U l&        [D        RN                  " [P        RR                  " U R                  5      5      U l*        [P        RV                  " SU R                  S-   5      n[D        RN                  " [P        RX                  " U5      5      U l-        S	U RZ                  l.        [_        U R                  U R,                  S
9U l0        [D        RN                  " [P        RR                  " U R                  5      5      U l1        S	U Rb                  l.        [D        RJ                  " U R                  U R                  U R(                  S9U l2        [f        (       d  [h        Rk                  S5        g [h        Rk                  S5        g )Nr   r   gMbP?g?r(   r   )in_channelsout_channelsrl   kernel_sizegroupspaddingrk   Tepsa  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzOThe fast path for GraniteMoeHybrid will be used when running the model on a GPU)6rm   rn   r   rv   rt   r   r   r   r   r   r   intermediate_sizeri   mamba_conv_biasuse_conv_bias
hidden_act
activationr	   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonr   n_groupsr   rH   mamba_chunk_sizer   floattime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1drz   in_proj	Parameterr,   r   dt_biasarangelogA_log_no_weight_decayGraniteMoeHybridRMSNormGatednormDout_projis_fast_path_availablero   rp   )r   rh   ri   projection_sizeArq   s        r1   rn   #GraniteMoeHybridMambaLayer.__init__q  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11 !$U5\2" ..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#01G1GTMdMde	ejj89"&		$"8"8$:J:JQUQ^Q^_%%>  qrr3   r?   cache_paramsr   rN   seq_idxc                 h   [        X5      nU R                  U5      nUR                  u  pxn	U R                  U R                  -  n
US L=(       a    UR
                  =(       a    US:H  =(       aw    UR                  U R                     R                  S   UR                  U R                     R                  S   s=:H  =(       a    U:H  Os  =(       a    US L=(       a    US   S:  nU(       Ga  UR                  S5      R                  U R                  U R                  U R                  /SS9u  pn[        UUR                  U R                     U R                  R                   R                  S5      U R                  R"                  U R$                  5      n[&        R                  " UU R                  X/SS9u  pn[&        R(                  " U R*                  R-                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R/                  SU R0                  U R                  5      R3                  [&        R4                  S9nUS S 2S S 2S 4   R/                  SSU R0                  5      nU R6                  S S 2S S4   R/                  SU R0                  5      nU R8                  S S 2S S4   R/                  SU R0                  5      nUR;                  XpR                  UR                  S   U R                  -  5      nUR;                  XpR                  UR                  S   U R                  -  5      nUR;                  XpR                  U R0                  5      n[=        UR                  U R                     UUUUUUS USS9
nUR;                  XpR                  U R0                  -  5      nU R?                  X5      nU RA                  U5      S S 2S S4   nU$ [&        R(                  " U R*                  R-                  5       5      * nU RB                  S	[-        S
5      4:X  a  0 OSU RB                  0nU RD                  (       a  Uc  [G        UU R                  R                   R                  S5      U R                  R"                  U R6                  U4U R8                  U RH                  UU R$                  U R>                  R                   U R>                  RJ                  U R@                  R                   U R@                  R"                  U R0                  U R                  SSS.UD6nU$ UR                  U R                  U R                  U R                  /SS9u  pnUbv  URM                  SS5      n[N        RP                  RS                  UU RT                  UR                  S   -
  S45      nUR                  U R                     RW                  U5        U R$                  S;  aH  U RY                  U R                  URM                  SS5      5      SS U24   RM                  SS5      5      nOn[[        URM                  SS5      U R                  R                   R                  S5      U R                  R"                  U R$                  US9RM                  SS5      n[        X5      n[&        R                  " UU R                  X/SS9u  pn[]        UR;                  XxSU R0                  5      UUUR;                  XxU R                  S5      UR;                  XxU R                  S5      4U RH                  U R8                  S USU R6                  SS.UD6u  nnUb+  Ub(  UR                  U R                     RW                  U5        UR;                  XxS5      nU R?                  UU5      nU RA                  U5      nU$ )Nr   r   r'   r)   .rS   T)zr   dt_softplusr   r   dt_limitF)r  r   r  r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr(   )siluswish)r.   weightrl   r   r  )r   r  r  r  r  r   r  )/r   r   r+   r   r   r   r   ri   r   squeezesplitr   r   rv   r"   r   r  rl   r   r,   expr  r   rC   rH   r\   r[   r   r  r   r   r  r  r   rU   r    r   variance_epsilonrX   r   rY   r   r   copy_r   r!   r   )r   r?   r  r   rN   r  projected_statesr   seq_lenr   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr	  r   r  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedr   scan_output	ssm_states                              r1   cuda_kernels_forward/GraniteMoeHybridMambaLayer.cuda_kernels_forward  s    5]S<<6 "/!4!4
Q!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q? & d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'DR
 !5!((8""**1-  ! #(++!'')?X#Ma 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2''7& M *..z>>DMM;YZM IIm:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AAR$c!&+kk%++-C\'#! *C!&&zBNFF:rBFF:rB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..zBG"iiT: mmK0
r3   c                    UR                   u  pVnUR                  n[        X5      nU R                  U5      n	U	R	                  U R
                  U R                  U R                  /SS9u  pnUS L=(       a    UR                  =(       a    US:H  =(       aw    UR                  U R                     R                   S   UR                  U R                     R                   S   s=:H  =(       a    U:H  Os  =(       a    US L=(       a    US   S:  nU(       GaT  UR                  U R                     R                  SSS9UR                  U R                  '   US S 2SS S 24   R                  UR                  U R                     R                  5      UR                  U R                     S S 2S S 2S4'   UR                  U R                     R                  U R                  R                   R                  S9n["        R$                  " XR                  R                   R'                  S5      -  SS9nU R(                  (       a  XR                  R*                  -   nU R-                  U5      nOUbu  UR/                  SS5      n[0        R2                  R5                  XR6                  UR                   S   -
  S45      nUR                  U R                     R9                  U5        U R-                  U R                  UR/                  SS5      5      SS U24   R/                  SS5      5      n[        X5      n["        R                  " UU R
                  U R:                  U R<                  -  U R:                  U R<                  -  /SS9u  nnn["        R>                  " U R@                  RC                  5       5      * nU(       Ga  UR                  U R                     R                  nUS S 2SS S 24   S S 2S S4   nUR/                  SS5      RE                  X\R                   S   U RF                  5      nU RH                  S	   RE                  U RH                  R                   S   U RF                  5      n["        R0                  R2                  RK                  UUR                  UR                  5      -   5      n["        RL                  " XRN                  S   U RN                  S   5      nUS
   RE                  U R                  U RF                  U R<                  5      R                  ["        RP                  S9n["        R>                  " US	   U-  5      R                  US9nURS                  XPR:                  S5      SS S S 24   nURE                  XPR:                  U R                  U R:                  -  UR                   S   5      RU                  5       nURS                  USUR                   S   5      nUS	   USS S S 24   -  nURS                  USU RF                  5      nUUS	   -  R                  US9nUR                  U R                     R9                  UR                  U R                     U-  U-   5        URS                  XPR:                  S5      SS S S 24   nURE                  XPR:                  U R                  U R:                  -  UR                   S   5      RU                  5       nURS                  USUR                   S   5      nUR                  U R                     R                  UR                  UR                  S9nURW                  XPR                  -  U RF                  U R<                  5      nURW                  XPR                  -  U R<                  S5      n["        RX                  " UU5      nURW                  XPR                  U RF                  5      nU RZ                  S	   RE                  U RZ                  R                   S   U RF                  5      nUUU-  -   R                  UR                  5      nURS                  US5      S S 2S S4   nGO[0        R2                  RK                  XRH                  -   5      n["        RL                  " XRN                  S   U RN                  S   5      nURS                  XVSU RF                  5      RC                  5       nURS                  XVSU R<                  5      RC                  5       nURS                  XVSU R<                  5      RC                  5       nUR]                  U R                  U R:                  -  SU R                  S9nUR]                  U R                  U R:                  -  SU R                  S9nU R^                  X`R^                  -  -
  U R^                  -  nU RZ                  S	   [a        UU5      -  nUUS	   -  nUR                  UR                  5      U-  nUUUU4 V s/ s H  n [c        U UU R^                  5      PM     sn u  nnnnURe                  SSSS5      n["        Rf                  " USS9n!["        R>                  " [i        U5      5      n"US S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n#U#R%                  SS9n$U$S	   U"Re                  SSSSS5      S	   -  n%U%R%                  SS9n&U&S	   US S 2S S 2S 4   -  R%                  SS9n'["        R>                  " U!S S 2S S 2S S 2SS 24   U!-
  5      n(UU(Re                  SSSS5      S	   -  n)U)SS S S 24   US	   -  R%                  SS9n*U(       a9  UR                  U R                     S S 2S S4   R                  U*R                  S9n+O["        Rj                  " U*S S 2S S24   5      n+["        Rl                  " U+U*/SS9n*["        R>                  " [i        [0        R2                  R5                  U!S S 2S S 2S S 2S4   S5      5      5      n,U,R/                  SS5      n,U,S
   U*S S 2S S 2S S4   -  R%                  SS9n-U-S S 2S S24   U-S S 2S4   n.n*["        R>                  " U!5      n/USS S S 24   U*S S 2S S 2S S4   -  n0U/Re                  SSSS5      n1U0R%                  S5      U1S	   -  n2U'U2-   nURS                  USU R                  U RF                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nURS                  XVS5      nU.b2  Ub/  UR                  U R                     R9                  U.5        SUl        U Ro                  UU
5      n3U Rq                  U3R                  U5      5      n4U4$ s  sn f )Nr'   r)   r   r   )shiftsdimsr   r(   .r   ).NNr  r   )r*   output_sizer
   r   rR   )r   r   T)9r+   rS   r   r   r  r   r   rv   r   r   ri   r   rollr\   r   r   r  r,   sumr  r   rl   r   rX   r   rY   r   r   r!  r   r   r  r  r   rC   rH   r   softplusclampr   r[   rD   r]   r   bmmr  repeat_interleaver   r   r   permuter   r   
zeros_liker-   r  r  )5r   input_statesr  r   rN   r   r#  r   rS   r"  r&  r'  r(  r%  r   r.  r?   r)  r*  r	  cache_devicer   dAdBdBxr   ssm_states_reshaped
C_reshapedyr  r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr0  state_decay_outC_times_statesstate_decay_out_permutedY_offr/  contextualized_statess5                                                        r1   torch_forward(GraniteMoeHybridMambaLayer.torch_forward^  s9    ".!3!3
Q"" 4LQ<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q? & d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		kk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//03H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89J[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!5!5a!8$:N:Nq:QRB)11*r4==Y__aM		*r43F3FGMMOA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99hq!Q|&<x&GIL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A $)A''7==iH26/ii4(
 !%knnU.C D$$I &{s   !vc                 ~   [         (       aA  SU R                  R                  R                  R                  ;   a  U R                  XX4U5      $ Ub  [        S5      eUR                  nUbC  UR                  S   S:  a0  UR                  S   S:  a  XS S 2S S 2S 4   -  R                  U5      nU R                  XX45      $ )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r   r   )r  r   r  r   typer1  NotImplementedErrorrS   r+   r\   r[  )r   r?   r  r   rN   r  r^   rS   s           r1   r   "GraniteMoeHybridMambaLayer.forward.  s     "!f0C0C0J0J0O0O&O,,].jqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*Aq$J-GGKKERM!!-~^^r3   )r  r  r   r   r   r   r   r   r   rH   rt   r   r   ri   r   r   r  rv   r  r   r   r   r   r   r   )NNNN)NNN)rr   r   r   r   r   r   r   rn   r,   r   r   r   r   	IntTensorr1  r[  r   r   r   r   s   @r1   r   r   c  sP   As5 As# AsL DH5915-1g||g ?@g !!1!12	g
 !.g %//*gZ DH5915M% ?@M% !!1!12	M%
 !.M%f DH5915-1_ ?@_ !!1!12	_
 !._ %//*_ _r3   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )r  iE  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g Nrm   rn   r   r   r,   r   r  r   r   rt   r   rq   s      r1   rn   %GraniteMoeHybridRMSNormGated.__init__F  s-    ll5::k#:; #r3   c                    UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  S5      R                  SSS9nU[        R                  " X@R                  -   5      -  nU R                  UR                  U5      -  $ Nr(   r'   T)keepdim)rS   r\   r,   r[   r   rY   r  powmeanrsqrtr   r  )r   r?   r&  input_dtypevariances        r1   r   $GraniteMoeHybridRMSNormGated.forwardK  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%H?T?T4T(UU{{]--k:::r3   r   r  gư>re  )rr   r   r   r   rn   r   r   r   r   s   @r1   r  r  E  s    $
	; 	;r3   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	GraniteMoeHybridMLPiW  zj
MLP layer for shared experts

Args:
    config:
        Configuration object with model hyperparameters.
rh   c                 `  > [         [        U ]  5         UR                  U l        UR
                  U l        [        UR                     U l        [        R                  " U R                  U R                  S-  SS9U l        [        R                  " U R                  U R                  SS9U l        g )Nr(   Frk   )rm   ru  rn   rt   
input_sizeshared_intermediate_sizer	   r   r   r   rz   input_linearoutput_linearr   rh   rq   s     r1   rn   GraniteMoeHybridMLP.__init__`  s    !413 ,,!:: !2!23IIdoot7G7G!7KRWXYYt'7'7uUr3   r?   rA   c                     U R                  U5      nUR                  SSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr(   r'   r)   r   r   )ry  chunkr   rz  )r   r?   chunked_hidden_statess      r1   r   GraniteMoeHybridMLP.forwardi  s^    ))-8 - 3 3A2 3 >(=a(@ADYZ[D\\**=9r3   )r   rt   ry  rw  rz  )rr   r   r   r   r   r   rn   r,   r   r   r   r   r   s   @r1   ru  ru  W  s7    V5 VU\\ ell  r3   ru  c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )GraniteMoeHybridRMSNormiq  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z6
GraniteMoeHybridRMSNorm is equivalent to T5LayerNorm
Nrf  rg  s      r1   rn    GraniteMoeHybridRMSNorm.__init__r  s/     	ll5::k#:; #r3   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ rj  )	rS   r\   r,   r[   rl  rm  rn  r   r  )r   r?   ro  rp  s       r1   r   GraniteMoeHybridRMSNorm.forwardz  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r3   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler  r+   r   r   s    r1   
extra_repr"GraniteMoeHybridRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr3   rr  rs  )	rr   r   r   r   rn   r   r  r   r   r   s   @r1   r  r  q  s    $;J Jr3   r  c                   B   ^  \ rS rSrS\S\S\SS4U 4S jjrS rS	rU =r$ )
GraniteMoeHybridParallelExpertsi  num_expertsrw  r6  rA   Nc                    > [         TU ]  5         [        R                  " [        R
                  " XU5      5      U l        Xl        X l        X0l	        g)a]  
Initialize the GraniteMoeHybridParallelExperts module.
The experts weights are stored in [num_experts, output_size, input_size] format. Such that it's compatible with
many MoE libraries, such as [Megablock](https://github.com/databricks/megablocks) and
[ScatterMoE](https://github.com/shawntan/scattermoe), as well as the
[MoE kernel](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/fused_moe/fused_moe.py)
used in vllm.

Args:
    num_experts (int):
        Number of experts.
    input_size (int):
        Size of the input.
    output_size (int):
        Size of the output.
N)
rm   rn   r   r   r,   emptyr  r  rw  r6  )r   r  rw  r6  rq   s       r1   rn   (GraniteMoeHybridParallelExperts.__init__  s<    " 	ll5;;{#TU&$&r3   c                     UR                  USS9n/ n[        U R                  5       H8  nUR                  [        R
                  " X5   U R                  U   5      5        M:     [        R                  " USS9nU$ )z
Forward pass of the GraniteMoeHybridParallelExperts module.

Args:
    inputs (Tensor):
        Input tensor.
    expert_size:
        Expert size information.

Returns:
    Tensor: Output tensor.
r   r)   )	r  r   r  r   Flinearr  r,   r-   )r   inputsexpert_size
input_listoutput_listr   resultss          r1   r   'GraniteMoeHybridParallelExperts.forward  sh     \\+1\5
t''(Aqxx
t{{1~FG )))KQ/r3   )rw  r  r6  r  	rr   r   r   r   r   rn   r   r   r   r   s   @r1   r  r    s.    'C 'S 's 't '. r3   r  c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r$ )GraniteMoeHybridTopKGatingi  rw  r  top_kc                 z   > [         TU ]  5         X l        Xl        X0l        [
        R                  " XSS9U l        g)z
Initialize the top-k gating mechanism.
Args:
    input_size (`int`):
        Size of the input.
    num_experts (`int`):
        Number of experts.
    top_k (`int`):
        Number of top experts to select.
Frk   N)rm   rn   r  rw  r  r   rz   layer)r   rw  r  r  rq   s       r1   rn   #GraniteMoeHybridTopKGating.__init__  s2     	&$
YYzUC
r3   c                 z   U R                  U5      R                  5       nUR                  U R                  SS9u  p4[        R
                  " USS9R                  U5      n[        R                  " UR                  S5      U R                  /UR                  UR                  S9nUR                  SUS5      nUR                  5       R                  S5      nUR                  5       nUR!                  5       n	U	R#                  S5      u  pUR%                  U R                  SS9nUR!                  5       nX[   nXXU4$ )Nr   r)   r   rS   r   trunc)rounding_mode)r  r   topkr  r,   rZ   type_asr   r   r  rS   r   scatterlongr8  tolistflattensortdiv)r   r?   logitstop_k_logitstop_k_indicestop_k_gatesr   gatesr  top_k_expertsr   index_sorted_expertsbatch_indexbatch_gatess                 r1   r   "GraniteMoeHybridTopKGating.forward  s"   M*002&,kk$**!k&D#mmLa8@@O a $"2"23;;L;LU`UgUg
 a2jjl&&q) "((* &--/"/"4"4Q"7*..tzz.Q "))+!7#+FRRr3   )rw  r  r  r  r  r   s   @r1   r  r    s-    D3 DS D D&S Sr3   r  c                   :   ^  \ rS rSrSrS\4U 4S jjrS rSrU =r	$ )GraniteMoeHybridMoEi  z
A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.

Args:
    config:
        Configuration object with model hyperparameters.
rh   c                   > [         [        U ]  5         UR                  U l        UR
                  U l        [        UR                     U l        [        UR                  U R                  U R                  S-  5      U l        [        UR                  U R                  U R                  5      U l        [        U R                  UR                  UR                  S9U l        g )Nr(   )rw  r  r  )rm   r  rn   rt   rw  r   r	   r   r   r  num_local_expertsry  rz  r  num_experts_per_tokrouterr{  s     r1   rn   GraniteMoeHybridMoE.__init__  s    !413 ,,!33 !2!23;$$doot7G7G!7K
 =$$d&6&6
 100,,
r3   c                    UR                  5       u  p#nUR                  SU5      nU R                  U5      u  pVpxn	X   n
U R                  X5      nUR	                  SSS9nU R                  US   5      US   -  nU R                  X5      nXSS2S4   -  n[        R                  " X#-  U R                  4UR                  UR                  S9nUR                  SXm5      nUR                  X#U R                  5      nX4$ )z
Forward pass of the mixture of experts layer.

Args:
    layer_input (Tensor):
        Input tensor.

Returns:
    Tensor:
        Output tensor.
    Tensor:
        Router logits.
r'   r(   r)   r   r   Nr  )r   rD   r  ry  r~  r   rz  r,   r   rw  rS   r   	index_addr   )r   layer_inputr   lengthemb_sizer   r  r  r  router_logitsexpert_inputsr?   r  expert_outputsr   layer_outputs                   r1   r   GraniteMoeHybridMoE.forward  s    !, 0 0 2X!))"h7BF++kBZ?-#0))-E - 3 3A2 3 >(=a(@ADYZ[D\\++MG'ag*>>S\4??;>CWCW`n`u`uvq+F#((dooF**r3   )r   rt   ry  rw  rz  r  )
rr   r   r   r   r   r   rn   r   r   r   r   s   @r1   r  r    s    
5 
&+ +r3   r  c                   b  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\
   S\	\   S	\	\   S
\	\R                     S\	\   S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )GraniteMoeHybridDecoderLayeri  rh   ri   c                   > [         TU ]  5         UR                  U l        S U l        [	        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l	        UR                  U l
        [        U5      U l        S U l        UR                  U   S:X  a  [        X5      U l        O[!        X5      U l        UR                  U   U l        g )Nr   r   )rm   rn   rt   	self_attnr  block_sparse_moer  r   input_layernormpost_attention_layernormresidual_multiplierru  
shared_mlpr   r   r   rf   
layer_typer   s      r1   rn   %GraniteMoeHybridDecoderLayer.__init__  s    !-- 3F ;6v7I7IvObObc(?@R@RX^XkXk(l%#)#=#= -f5
##I.'93FFDJ6vIDN 229=r3   r?   rN   r   r   r   r   output_router_logitsr   rA   c	                    Un
U R                  U5      nU R                  b  U R                  UUUUS9nSnOU R                  " SUUUUUUUS.U	D6u  pnXU R                  -  -   nUn
U R	                  U5      nU R                  U5      u  pXR                  U5      -   nXU R                  -  -   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )aY  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    output_router_logits (`bool`, *optional*):
        Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
        should not be returned during inference.
    position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
N)r?   r   r  rN   )r?   rN   r   r   r   r   r    )r  r   r  r  r  r  r  )r   r?   rN   r   r   r   r   r  r   r^   residualself_attn_weightsr   moe_hidden_statesr  outputss                   r1   r   $GraniteMoeHybridDecoderLayer.forward2  s   J !,,];::! JJ+-+-	 ' M !%26.. 	3+--"3#-$7	3 	3/Ma !43K3K#KK !55mD+/+@+@+O()OOM,JJ 43K3K#KK "++G((G''Gr3   )	r  rt   r  r  r   r  r  r  r  )NNFFNFN)rr   r   r   r   r   r   rn   r,   r   r   r   r   r   r   FloatTensorr   r   r   r   s   @r1   r  r    s   >5 ># >, 26*.,1$)59/4KOR||R !.R !	R
 $D>R D>R !!1!12R 'tnR &eELL%,,,F&GHR 
u  (51B1BEDUDU1U+V"WW	XR Rr3   r  c                   J    \ rS rSr\rSrSrS/rS/r	Sr
SrSrSrSrSrS rSrg	)
GraniteMoeHybridPreTrainedModeli  modelTr  past_key_valuesFc                 ,   [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b$  UR                  R                  R                  5         GO[        U[        R                  5      (       aw  UR                  R                  R                  SU R                  R                  S9  UR                  b1  UR                  R                  UR                     R                  5         O[        U[        5      (       a&  UR                  R                  R                  S5        OM[        U[        5      (       a8  UR                  R                  R                  SU R                  R                  S9  [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         g g [        U[         5      (       a  UR"                  R                  R                  S5        [$        R&                  " [$        R(                  " SUR*                  S-   5      5      UR,                  l        UR.                  R                  R                  S5        g [        U[0        5      (       a&  UR                  R                  R                  S5        g g )Nr   )rm  stdg      ?r   )
isinstancer   rz   r  datanormal_rh   initializer_rangerl   zero_	Embeddingpadding_idxr  fill_r  r   r   r   r,   r   r   rv   r  r  r  )r   rJ   s     r1   _init_weights-GraniteMoeHybridPreTrainedModel._init_weights  s   fbii((MM&&CT[[5R5R&S{{&  &&(--MM&&CT[[5R5R&S!!-""6#5#56<<> 788MM$$S) ?@@MM&&CT[[5R5R&Sfryy**MM&&CT[[5R5R&S{{&  &&( ' :;;NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <==MM$$S) >r3   r  N)rr   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_quantized_cache_supports_static_cache_is_statefulr  r   r  r3   r1   r  r    sL    )L&*#78#4"5!N  $"L*r3   r  c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )GraniteMoeHybridRotaryEmbeddingi  rh   c                   > [         TU ]  5         [        US5      (       aH  UR                  b;  UR                  R	                  SUR                  R	                  S5      5      U l        OSU l        UR                  U l        UR                  U l        Xl	        [        U R
                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                  U l        g )Nrope_scaling	rope_typer_  defaultinv_freqF)
persistent)rm   rn   hasattrr  r   r  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrh   r   rope_init_fnattention_scalingregister_bufferr  original_inv_freq)r   rh   r   r  rq   s       r1   rn   (GraniteMoeHybridRotaryEmbedding.__init__  s    6>**v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r3   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r'   r   mpscpuF)device_typeenabledr(   r)   r  )r  r   rC   r+   r\   r   r  r_  strr,   autocastrX   r-   r8   r  r9   rS   )
r   r.   r:   inv_freq_expandedposition_ids_expandedr  freqsembr8   r9   s
             r1   r   'GraniteMoeHybridRotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r  rh   r  r  r  r  r  re  )rr   r   r   r   r   rn   r,   no_gradr   r   r   r   r   s   @r1   r  r    s7    /5 / /" ]]_<  <r3   r  c                   J  ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
R                  S\\
R                     S\\
R                     S	\\\\\
R"                     4      S
\\
R"                     S\\   S\\   S\\   S\\   S\\   S\\
R                     S\\\4   4S jj5       5       r SS\\
R                  S4   S\
R                  S\
R                  S	\S\4
S jjr\S\
R                  S\S\S\
R2                  S\
R                  S\4S j5       rS rSrU =r$ )GraniteMoeHybridModeli  rh   c           	      8  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ s H  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        SU l        UR$                  U l        UR                  U l        UR&                  U l        U R                  U R(                  -  U l        UR,                  U l        UR.                  U l        UR0                  U l        U R0                  S:X  a  [3        U5      OS U l        U R7                  5         g s  snf )Nr   Frope)rm   rn   pad_token_idr  
vocab_sizer   r  rt   embed_tokens
ModuleListr   r   r  layersr  r   r  gradient_checkpointingembedding_multiplierru   rv   rH   r  
rope_thetaposition_embedding_typer  
rotary_emb	post_initr   s      r1   rn   GraniteMoeHybridModel.__init__  sE    !.. ++LL):):F<N<NPTP`P`ammNSTZTlTlNmnNm)&<Nmn
 ,F,>,>FDWDWX	&+#$*$?$?!!--33((DNN:'-'E'E$ ++'-'E'E$EIEaEaekEk9&Aqu 	! os   Fc                     U R                   $ re  r  r  s    r1   get_input_embeddings*GraniteMoeHybridModel.get_input_embeddings  s       r3   c                     Xl         g re  r'  r   rM   s     r1   set_input_embeddings*GraniteMoeHybridModel.set_input_embeddings  s    !r3   	input_idsrN   r:   r  inputs_embedsr   r   output_hidden_statesr  return_dictr   rA   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        R                  S5        UcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9nUc  UR#                  S5      nU R%                  X%XU5      nU R'                  X+5      nUnS nU R(                  b  U R)                  X5      nU(       a  SOS nU(       a  SOS nU	(       a  SOS nS nU R*                   H{  nUR,                  S	:X  a  UOUnU(       a  UU4-  nU" UUUUUUU	US
9nUS   nU(       a  UU(       a  SOS   nU(       a  US   b	  UUS   4-  nU	(       d  Mj  US   c  Mr  UUS   4-  nM}     U R/                  U5      nU(       a  UU4-  nU(       a  UOS n[1        UUUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   r   r  r   )rN   r   r   r   r   r  r   r(   r'   )last_hidden_stater  r?   
attentionsr  )rh   r   r0  r   use_return_dictry   r  rU   ro   rp   r  r   get_seq_lengthr,   r   r+   r   r5   _update_causal_mask_update_mamba_maskr#  r  r  r  r   )r   r.  rN   r:   r  r/  r   r   r0  r  r1  r   past_seen_tokensrb   
mamba_maskr?   r   all_hidden_statesall_self_attnsall_router_logitsnext_decoder_cachedecoder_layer
layer_masklayer_outputs
next_caches                            r1   r   GraniteMoeHybridModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 ,,^L
 &"??&"&//-"N #7BD0d"6BD!![[M'4'?'?7'JP[J#!m%55!))."3#-%9$7	M *!,M%28I1q%Q"  #/"}Q'7&99N## $0%-*;)==%? )B 		-0  -!11+4'$
%+&+%+
 	
r3   r$   r   c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r   flex_attentionr   Fr   )r/  past_key_values_lengthis_trainingr   r'   )sequence_lengthtarget_lengthrS   r   r   )r^  xpunpu)rh   r   anyr  r,   r   r%   r6  is_compileabler   _ignore_causal_mask_sdparU   rS   r+   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r_  finfomin_unmask_unattended)r   rN   r   r   r  r   r9  using_compilable_cacherS   rI  rJ  rb   	min_dtypes                r1   r7  )GraniteMoeHybridModel._update_causal_maskj  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr3   rI  rJ  rS   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr   )
fill_valuerS   r   r   r   r   r'   r   )r*   r,   rR  rS  fullr   triur   rD   rC   cloner+   r\   r   )rN   rI  rJ  rS   r   r   r^   rb   rV  mask_lengthpadding_masks              r1   rQ  KGraniteMoeHybridModel._prepare_4d_causal_attention_mask_with_cache_position  s}   < %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r3   c                 b    UnUS   S:  d!  Ub   [         R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
r   Nr   )r,   all)r   rN   r   r:  s       r1   r8  (GraniteMoeHybridModel._update_mamba_mask  s:     $
!q ^%?EIIn`aNaDbDbJr3   )r  r   r  rH   rt   r  r  r  rv   r  r"  r!  r#  r  )NNNNNNNNNNN)F)rr   r   r   r   r   rn   r(  r,  r   r   r,   r   r   r   r   r   r   r  r   r   r   r   r7  staticmethodr   rS   rQ  r8  r   r   r   s   @r1   r  r    s   5 2!"  '+1537KO59$(,0/3/3&*59t
##t
 !.t
 u//0	t

 "%tE4E4E/F(F"GHt
   1 12t
 D>t
 $D>t
 'tnt
 'tnt
 d^t
 !!1!12t
 
u--	.t
  t
x #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4l	 	r3   r  gate_logitsr  c                    U b  [        U [        5      (       d  g[        U [        5      (       aC  U S   R                  n[        R                  " U  Vs/ s H  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGOUR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XX45      R                  SU5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  n[        R                   " XR#                  S5      -  5      nUU-  $ s  snf )ap  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r)   r'   )r  r  r   r,   r-   r\   r   rY   rZ   r  one_hotrm  r   r+   rC   rD   r8  r5   )rd  r  r  rN   compute_device
layer_gateconcatenated_gate_logitsrouting_weightsr   selected_expertsexpert_masktokens_per_expertrouter_prob_per_expertr   rI  r   expert_attention_mask router_per_expert_attention_maskoverall_losss                      r1   load_balancing_loss_funcrr    s+   : *[%"@"@+u%%$Q..#(99^i-j^iPZmmN.K^i-jpq#r hh))112JPR1SO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
4::1=*B^_ 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OQRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1Q1QRS1TTUL+%%[ .ks   Ic                      ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	 rS
 r\             SS\\R                      S\\R"                     S\\R                      S\\\\\R*                     4      S\\R*                     S\\R                      S\\   S\\   S\\   S\\   S\\   S\\R                      S\\\R"                  4   S\\\4   4S jj5       r\S 5       r      SS jrS\4S jrSrU =r $ ) GraniteMoeHybridForCausalLMiC  zlm_head.weightrh   c                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l        UR                  U l        U R                  5         g )NFrk   )rm   rn   r  r  r  r   rz   rt   lm_headrouter_aux_loss_coefr  r  r  r$  r{  s     r1   rn   $GraniteMoeHybridForCausalLM.__init__F  s     *62
 ++yy!3!3V5F5FUS$*$?$?!!33#)#=#=  	r3   c                 .    U R                   R                  $ re  r  r  r  s    r1   r(  0GraniteMoeHybridForCausalLM.get_input_embeddingsS  s    zz&&&r3   c                 $    XR                   l        g re  rz  r+  s     r1   r,  0GraniteMoeHybridForCausalLM.set_input_embeddingsV  s    "'

r3   c                     U R                   $ re  rv  r  s    r1   get_output_embeddings1GraniteMoeHybridForCausalLM.get_output_embeddingsY  s    ||r3   c                     Xl         g re  r  )r   new_embeddingss     r1   set_output_embeddings1GraniteMoeHybridForCausalLM.set_output_embeddings\  s    %r3   c                     Xl         g re  r  )r   decoders     r1   set_decoder'GraniteMoeHybridForCausalLM.set_decoder_  s    
r3   c                     U R                   $ re  r  r  s    r1   get_decoder'GraniteMoeHybridForCausalLM.get_decoderb  s    zzr3   r.  rN   r:   r  r/  labelsr   r   r0  r  r1  r   logits_to_keeprA   c                    Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
U	b  U	OU R                   R                  n	Ub  UOU R                   R                  nU R                  UUUUUUUU	U
UUS9nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nUU R                   R                  -  nSnUb:  UR                  5       nU R                  " UU4SU R                   R                  0UD6nSnU
(       af  [        U(       a  UR                  OUS   U R                   U R"                  U5      nUb+  UU R$                  UR'                  UR(                  5      -  -  nU(       d!  U4USS -   nU
(       a  U4U-   nUb  U4U-   $ U$ [+        UUUUR,                  UR.                  UR0                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, GraniteMoeHybridForCausalLM

>>> model = GraniteMoeHybridForCausalLM.from_pretrained("ibm/PowerMoE-3b")
>>> tokenizer = AutoTokenizer.from_pretrained("ibm/PowerMoE-3b")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r.  rN   r:   r  r/  r   r   r0  r  r1  r   r   r  r'   r   )lossaux_lossr  r  r?   r4  r  )rh   r   r  r0  r5  r  r  r   slicerv  logits_scalingr   loss_functionr  rr  r  r  r  rw  r\   r   r   r  r?   r4  )r   r.  rN   r:   r  r/  r  r   r   r0  r  r1  r   r  r^   r  r?   slice_indicesr  r  r  outputs                         r1   r   #GraniteMoeHybridForCausalLM.forwarde  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5!5#)  
  
8B>SV8W8W~ot4]kmA}a,?@A$++444\\^F%%  ;;11 	D /)4%%'"+  ((	H !11HKK4LLLY,F#"v-'+'7D7V#CVC(#33!//))!//
 	
r3   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nr  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr\   r   ).0
past_statebeam_idxs     r1   	<genexpr>=GraniteMoeHybridForCausalLM._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)r  )r  r  reordered_past
layer_pasts    `  r1   _reorder_cache*GraniteMoeHybridForCausalLM._reorder_cache  s8    )Jncmnn N * r3   c                 n   US L n	U	(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOaUR                   S   UR                   S   :w  a	  US S 2U4   nO7[        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U	(       d  US S 2UR                   S   * S 24   nUb  U	(       a  SU0n
OSUR                  5       0n
U
R                  UUUUUS.5        U
$ )Nr'   r   r   r   r/  r.  )r:   r  r   rN   r   )
r+   r   rh   rS   r   r  r   masked_fill_r]   r   )r   r.  r  rN   r/  r   r:   r   r^   empty_past_kvmodel_inputss              r1   prepare_inputs_for_generation9GraniteMoeHybridForCausalLM.prepare_inputs_for_generation  sU    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 r3   c                     g)a  
Function overwritten as this class uses its own `HybridMambaAttentionDynamicCache`
and do not need to initialize the Cache in advance in order to save memory
(because no back and forth `to_legacy_cache` and `from_legacy_cache` will be performed
for `HybridMambaAttentionDynamicCache`).
Fr  r  s    r1   _supports_default_dynamic_cache;GraniteMoeHybridForCausalLM._supports_default_dynamic_cache  s     r3   )rv  r  r  r  rw  r  )NNNNNNNNNNNNr   )NNNNNT)!rr   r   r   r   _tied_weights_keysr   rn   r(  r,  r  r  r  r  r   r   r,   r   r   r   r   r   r  r   r   r   r   r   rc  r  r  r  r   r   r   s   @r1   rt  rt  C  s   *+5 '(&  151537KO59-1$(,0/3/3&*5934j
E,,-j
 !.j
 u//0	j

 "%tE4E4E/F(F"GHj
   1 12j
 ))*j
 D>j
 $D>j
 'tnj
 'tnj
 d^j
 !!1!12j
 c5<</0j
  
u//	0!j
 j
X   7r  r3   rt  )rt  r  r  )Nr   )r   )Nr(   N)Ztypingr   r   r   r   r   r,   torch.nn.functionalr   rY   r  (transformers.models.jamba.modeling_jambamodelsjambamodeling_jambatransformers.activationsr	   cache_utilsr   
generationr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   utilsr   r   r   r   utils.import_utilsr   r   configuration_granitemoehybridr   +mamba_ssm.ops.triton.selective_state_updater   !mamba_ssm.ops.triton.ssd_combinedr   r    causal_conv1dr!   r"   !torch.nn.attention.flex_attentionr$   integrations.flex_attentionr%   
get_loggerrr   ro   r2   r>   r   r   rI   Moduler   rd   rf   r   r   r   r   ra  r  r   r   r  ru  r  r  r  r  r  r  r  r  rr  rt  __all__r  r3   r1   <module>r     s   , : 9     A A +   ) > 9 j j K F \ \ V B Rmm!DD-7**  !!;J 
		H	%(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %:Y9		 Y9z3u~'V'V 3urVU\\ VS V
(( 46FH\]^ __ __D;588?? ;$")) 4Jbii J(*bii *Z-S -S`9+")) 9+xf#= fR $*o $* $*N<bii <D \; \ \B	 "&
-1	O&u||U5<<%8$>?O&#O& U\\*	O&
 5<<O&dX"A? Xv fr3   