
from typing import Literal

from ...configuration_utils import PretrainedConfig


class ModernBertConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate a ModernBert
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the ModernBERT-base.
e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 50368):
        Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`ModernBertModel`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 1152):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 22):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder. Will default to `"gelu"`
        if not specified.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
        The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the layer normalization layers.
    norm_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the normalization layers.
    pad_token_id (`int`, *optional*, defaults to 50283):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 50282):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 50281):
        Beginning of stream token id.
    cls_token_id (`int`, *optional*, defaults to 50281):
        Classification token id.
    sep_token_id (`int`, *optional*, defaults to 50282):
        Separation token id.
    global_rope_theta (`float`, *optional*, defaults to 160000.0):
        The base period of the global RoPE embeddings.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    global_attn_every_n_layers (`int`, *optional*, defaults to 3):
        Global attention is used in every `global_attn_every_n_layers`-th layer (counting from layer 0); the other
        layers use local sliding-window attention (see the sketch after this argument list).
    local_attention (`int`, *optional*, defaults to 128):
        The window size for local attention.
    local_rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the local RoPE embeddings.
    embedding_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the embeddings.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the MLP layers.
    mlp_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the MLP layers.
    decoder_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the decoder layers.
    classifier_pooling (`str`, *optional*, defaults to `"cls"`):
        The pooling method for the classifier. Should be either `"cls"` or `"mean"`. Note that in local attention
        layers the CLS token does not attend to all tokens on long sequences, so `"mean"` pooling may be preferable
        for long inputs.
    classifier_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the classifier.
    classifier_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the classifier.
    classifier_activation (`str`, *optional*, defaults to `"gelu"`):
        The activation function for the classifier.
    deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
        Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
    sparse_prediction (`bool`, *optional*, defaults to `False`):
        Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
    sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
        The index to ignore for the sparse prediction.
    reference_compile (`bool`, *optional*):
        Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
        the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
        shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
        be faster in some scenarios.
    repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
        When `True`, [`ModernBertForMaskedLM`] keeps track of the logits' gradient when repadding for output. This only
        applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.
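
A note on the attention layout: with the defaults above, a layer uses global attention when its index is a multiple
of `global_attn_every_n_layers` and a `local_attention`-sized sliding window otherwise. The helper below is a minimal
sketch of that pattern, not code from the library, and assumes the convention that layer 0 is a global-attention layer:

```python
>>> # Hypothetical helper (not part of transformers): map layer indices to their attention type.
>>> def attention_layout(num_hidden_layers=22, global_attn_every_n_layers=3):
...     return ["global" if i % global_attn_every_n_layers == 0 else "local" for i in range(num_hidden_layers)]

>>> attention_layout()[:6]
['global', 'local', 'local', 'global', 'local', 'local']
```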

Examples:

```python
>>> from transformers import ModernBertModel, ModernBertConfig

>>> # Initializing a ModernBert style configuration
>>> configuration = ModernBertConfig()

>>> # Initializing a model from the modernbert-base style configuration
>>> model = ModernBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
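
A configuration can also be customized at construction time. The snippet below is a small illustrative sketch (the
keyword arguments are the ones documented above; the chosen values are examples, not recommended settings):

```python
>>> from transformers import ModernBertConfig

>>> # E.g. keep the default alternating attention but switch the classifier to mean pooling
>>> configuration = ModernBertConfig(
...     global_attn_every_n_layers=3,
...     local_attention=128,
...     classifier_pooling="mean",
... )
>>> configuration.classifier_pooling
'mean'
```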
    """

    model_type = "modernbert"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50368,
        hidden_size=768,
        intermediate_size=1152,
        num_hidden_layers=22,
        num_attention_heads=12,
        hidden_activation="gelu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        initializer_cutoff_factor=2.0,
        norm_eps=1e-5,
        norm_bias=False,
        pad_token_id=50283,
        eos_token_id=50282,
        bos_token_id=50281,
        cls_token_id=50281,
        sep_token_id=50282,
        global_rope_theta=160000.0,
        attention_bias=False,
        attention_dropout=0.0,
        global_attn_every_n_layers=3,
        local_attention=128,
        local_rope_theta=10000.0,
        embedding_dropout=0.0,
        mlp_bias=False,
        mlp_dropout=0.0,
        decoder_bias=True,
        classifier_pooling: Literal["cls", "mean"] = "cls",
        classifier_dropout=0.0,
        classifier_bias=False,
        classifier_activation="gelu",
        deterministic_flash_attn=False,
        sparse_prediction=False,
        sparse_pred_ignore_index=-100,
        reference_compile=None,
        repad_logits_with_grad=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.global_rope_theta = global_rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.local_attention = local_attention
        self.local_rope_theta = local_rope_theta
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad

        if self.classifier_pooling not in ["cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.'
            )

    def to_dict(self):
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


__all__ = ["ModernBertConfig"]