
"""GraniteMoeHybrid model configuration"""

from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
from ...utils import logging


logger = logging.get_logger(__name__)


class GraniteMoeHybridConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`GraniteMoeHybridModel`]. It is used to
instantiate a GraniteMoeHybrid model according to the specified arguments, defining the model architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 32000):
        Vocabulary size of the GraniteMoeHybrid model. Defines the number of different tokens that
        can be represented by the `input_ids` passed when calling [`GraniteMoeHybridModel`].
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 11008):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, it will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
        Only relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*):
        Padding token id.
    bos_token_id (`int`, *optional*, defaults to 1):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*, defaults to 2):
        End of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
        strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
        `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
        `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
        these scaling strategies behave:
        https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
        experimental feature, subject to breaking API changes in future versions.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    embedding_multiplier (`float`, *optional*, defaults to 1.0): embedding multiplier.
    logits_scaling (`float`, *optional*, defaults to 1.0): divisor for output logits.
    residual_multiplier (`float`, *optional*, defaults to 1.0): residual multiplier.
    attention_multiplier (`float`, *optional*, defaults to 1.0): attention multiplier.
    num_local_experts (`int`, *optional*, defaults to 8): total number of experts.
    num_experts_per_tok (`int`, *optional*, defaults to 2): number of experts per token.
    output_router_logits (`bool`, *optional*, defaults to `False`):
        Whether or not the router logits should be returned by the model. Enabling this will also
        allow the model to output the auxiliary loss.
    router_aux_loss_coef (`float`, *optional*, defaults to 0.001): router auxiliary loss coefficient.
    shared_intermediate_size (`int`, *optional*, defaults to 1024): intermediate size for shared experts.
    position_embedding_type (`str`, *optional*): Positional embedding
        type to be used; defaults to None. Allowed options: `[None, "rope"]`
    layer_types (`List`, *optional*): list of strings to be used as layer types.
        Allowed choices: "mamba", "attention".
    mamba_n_heads (`int`, *optional*, defaults to 128):
        The number of mamba heads used.
    mamba_n_groups (`int`, *optional*, defaults to 1):
        The number of mamba groups used.
    mamba_d_state (`int`, *optional*, defaults to 256):
        The dimension of the mamba latent state space.
    mamba_d_head (`int`, *optional*, defaults to `"auto"`):
        Head embedding dimension size.
    mamba_d_conv (`int`, *optional*, defaults to 4):
        The size of the mamba convolution kernel.
    mamba_expand (`int`, *optional*, defaults to 2):
        Expanding factor (relative to hidden_size) used to determine the mamba intermediate size.
    mamba_chunk_size (`int`, *optional*, defaults to 256):
        The chunks in which to break the sequence when doing prefill/training.
    mamba_conv_bias (`bool`, *optional*, defaults to `True`):
        Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
    mamba_proj_bias (`bool`, *optional*, defaults to `False`):
        Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"])
        of the mamba mixer block.
```python
>>> from transformers import GraniteMoeHybridModel, GraniteMoeHybridConfig

>>> # Initializing a GraniteMoeHybrid config
>>> configuration = GraniteMoeHybridConfig()

>>> # Initializing a model from the configuration
>>> model = GraniteMoeHybridModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
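
>>> # Illustrative, non-default example (the values below are chosen for demonstration
>>> # only): alternate mamba and attention blocks, use grouped-query attention with
>>> # 8 key/value heads, and enable rotary embeddings with linear RoPE scaling.
>>> custom_config = GraniteMoeHybridConfig(
...     num_hidden_layers=4,
...     layer_types=["mamba", "attention", "mamba", "attention"],
...     num_key_value_heads=8,
...     position_embedding_type="rope",
...     rope_scaling={"type": "linear", "factor": 2.0},
... )

>>> # With the documented defaults (hidden_size=4096, mamba_expand=2, mamba_n_heads=128),
>>> # mamba_d_head="auto" resolves to (2 * 4096) // 128 = 64.
>>> custom_config.mamba_d_head
64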
```"""

    model_type = "granitemoehybrid"
    attribute_map = {"layers_block_type": "layer_types"}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        embedding_multiplier=1.0,
        logits_scaling=1.0,
        residual_multiplier=1.0,
        attention_multiplier=1.0,
        num_local_experts=8,
        num_experts_per_tok=2,
        output_router_logits=False,
        router_aux_loss_coef=0.001,
        shared_intermediate_size=1024,
        position_embedding_type=None,
        layer_types=None,
        mamba_n_heads=128,
        mamba_n_groups=1,
        mamba_d_state=256,
        mamba_d_head="auto",
        mamba_d_conv=4,
        mamba_expand=2,
        mamba_chunk_size=256,
        mamba_conv_bias=True,
        mamba_proj_bias=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Fall back to multi-head attention when no key/value head count is given.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.embedding_multiplier = embedding_multiplier
        self.logits_scaling = logits_scaling
        self.residual_multiplier = residual_multiplier
        self.attention_multiplier = attention_multiplier
        self.attention_dropout = attention_dropout
        self.num_local_experts = num_local_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.output_router_logits = output_router_logits
        self.router_aux_loss_coef = router_aux_loss_coef
        self.shared_intermediate_size = shared_intermediate_size
        self.position_embedding_type = position_embedding_type
        self.layer_types = layer_types

        mamba_intermediate = mamba_expand * hidden_size

        if layer_types is not None and any(layer_type not in ["mamba", "attention"] for layer_type in layer_types):
            raise ValueError("layer_types must be a list of strings in [`mamba`, `attention`]")

        if mamba_intermediate % mamba_n_heads != 0:
            raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size")

        # Derive the Mamba head dimension from the intermediate size when requested.
        if mamba_d_head == "auto":
            mamba_d_head = mamba_intermediate // mamba_n_heads

        if mamba_d_head * mamba_n_heads != mamba_intermediate:
            raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size")

        self.mamba_n_heads = mamba_n_heads
        self.mamba_d_head = mamba_d_head
        self.mamba_n_groups = mamba_n_groups
        self.mamba_d_state = mamba_d_state
        self.mamba_d_conv = mamba_d_conv
        self.mamba_chunk_size = mamba_chunk_size
        self.mamba_conv_bias = mamba_conv_bias
        self.mamba_proj_bias = mamba_proj_bias
        self.mamba_expand = mamba_expand

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        # RoPE parameters only need to be validated when rotary embeddings are used.
        if self.position_embedding_type == "rope":
            rope_config_validation(self)

    @property
    def layers_block_type(self):
        return self.layer_types if self.layer_types else ["mamba"] * self.num_hidden_layers


__all__ = ["GraniteMoeHybridConfig"]