from typing import Literal

from ...configuration_utils import PretrainedConfig


class ModernBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate a ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of ModernBERT-base,
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`ModernBertModel`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        global_rope_theta (`float`, *optional*, defaults to 160000.0):
            The base period of the global RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        global_attn_every_n_layers (`int`, *optional*, defaults to 3):
            Use global attention in every n-th layer; the remaining layers use local (sliding-window) attention.
        local_attention (`int`, *optional*, defaults to 128):
            The sliding window size (in tokens) for the local attention layers.
        local_rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the local RoPE embeddings.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder (language modeling head) layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. Note that in local attention
            layers the CLS token cannot attend to all tokens of a long sequence, so `"mean"` pooling can be preferable there.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios.
        repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
            When `True`, `ModernBertForMaskedLM` keeps track of the logits' gradient when repadding for output. This only
            applies when using Flash Attention 2 with passed labels. Otherwise, output logits always have a gradient.

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the ModernBERT-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
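
    >>> # A hypothetical smaller configuration; illustrative sizes only, not an
    >>> # official checkpoint (any constructor argument can be overridden this way)
    >>> small_configuration = ModernBertConfig(hidden_size=384, num_hidden_layers=12, num_attention_heads=6)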
    ```"""

    model_type = "modernbert"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50368,
        hidden_size=768,
        intermediate_size=1152,
        num_hidden_layers=22,
        num_attention_heads=12,
        hidden_activation="gelu",
        max_position_embeddings=8192,
        initializer_range=0.02,
        initializer_cutoff_factor=2.0,
        norm_eps=1e-5,
        norm_bias=False,
        pad_token_id=50283,
        eos_token_id=50282,
        bos_token_id=50281,
        cls_token_id=50281,
        sep_token_id=50282,
        global_rope_theta=160000.0,
        attention_bias=False,
        attention_dropout=0.0,
        global_attn_every_n_layers=3,
        local_attention=128,
        local_rope_theta=10000.0,
        embedding_dropout=0.0,
        mlp_bias=False,
        mlp_dropout=0.0,
        decoder_bias=True,
        classifier_pooling: Literal["cls", "mean"] = "cls",
        classifier_dropout=0.0,
        classifier_bias=False,
        classifier_activation="gelu",
        deterministic_flash_attn=False,
        sparse_prediction=False,
        sparse_pred_ignore_index=-100,
        reference_compile=None,
        repad_logits_with_grad=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            cls_token_id=cls_token_id,
            sep_token_id=sep_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.initializer_range = initializer_range
        self.initializer_cutoff_factor = initializer_cutoff_factor
        self.norm_eps = norm_eps
        self.norm_bias = norm_bias
        self.global_rope_theta = global_rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.global_attn_every_n_layers = global_attn_every_n_layers
        self.local_attention = local_attention
        self.local_rope_theta = local_rope_theta
        self.embedding_dropout = embedding_dropout
        self.mlp_bias = mlp_bias
        self.mlp_dropout = mlp_dropout
        self.decoder_bias = decoder_bias
        self.classifier_pooling = classifier_pooling
        self.classifier_dropout = classifier_dropout
        self.classifier_bias = classifier_bias
        self.classifier_activation = classifier_activation
        self.deterministic_flash_attn = deterministic_flash_attn
        self.sparse_prediction = sparse_prediction
        self.sparse_pred_ignore_index = sparse_pred_ignore_index
        self.reference_compile = reference_compile
        self.repad_logits_with_grad = repad_logits_with_grad

        if self.classifier_pooling not in ["cls", "mean"]:
            raise ValueError(
                f'Invalid value for `classifier_pooling`, should be either "cls" or "mean", but is {self.classifier_pooling}.'
            )

    def to_dict(self):
        # `reference_compile` depends on the runtime environment, so it is
        # dropped from the serialized configuration.
        output = super().to_dict()
        output.pop("reference_compile", None)
        return output


__all__ = ["ModernBertConfig"]
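

# A minimal usage sketch, not part of the upstream module: it exercises the
# `classifier_pooling` validation and shows that `to_dict()` drops the
# environment-dependent `reference_compile` flag, as documented above. The
# `__main__` guard keeps importing this module side-effect free; because of the
# relative import at the top, run it as a module, e.g.
# `python -m transformers.models.modernbert.configuration_modernbert`.
if __name__ == "__main__":
    config = ModernBertConfig(classifier_pooling="mean")
    assert "reference_compile" not in config.to_dict()

    try:
        ModernBertConfig(classifier_pooling="max")  # intentionally invalid
    except ValueError as err:
        print(f"Caught expected error: {err}")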