"""DBRX model configuration"""

from typing import Any, Optional

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*):
            If set, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
        rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
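
    Example (a minimal construction sketch; the values below are illustrative, not the
    class defaults):

    ```python
    >>> # DbrxAttentionConfig is not exported from the top-level package, so import it
    >>> # from the module directly
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxAttentionConfig

    >>> # Grouped-query attention with 8 shared key/value heads and QKV clipping at 8.0
    >>> attn_config = DbrxAttentionConfig(attn_pdrop=0.0, clip_qkv=8.0, kv_n_heads=8)
    ```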
    """

    base_config_key = "attn_config"

    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        # Drop bookkeeping keys that PretrainedConfig may inject, then reject anything else
        for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of the activation function along with
            any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
        ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
        moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
        moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
        moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
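
    Example (an illustrative mixture-of-experts setup; the values are chosen for the
    example, loosely modeled on the released DBRX checkpoints, and are not the class
    defaults):

    ```python
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxFFNConfig

    >>> # 16 experts, with each token routed to the top 4
    >>> ffn_config = DbrxFFNConfig(ffn_hidden_size=10752, moe_num_experts=16, moe_top_k=4)
    ```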
    
    """

    base_config_key = "ffn_config"

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1.0,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights

        # Drop bookkeeping keys that PretrainedConfig may inject, then reject anything else
        for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxConfig(PretrainedConfig):
    r"""

    This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a different configuration to that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `input_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
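
    Sub-configurations may also be passed as plain dicts; a short sketch (the values are
    illustrative):

    ```python
    >>> # Dicts are promoted to DbrxAttentionConfig / DbrxFFNConfig instances in __init__
    >>> configuration = DbrxConfig(
    ...     n_layers=2,
    ...     d_model=256,
    ...     n_heads=8,
    ...     vocab_size=128,
    ...     attn_config={"kv_n_heads": 2},
    ...     ffn_config={"moe_num_experts": 4, "moe_top_k": 2},
    ... )

    >>> # `attribute_map` exposes the DBRX-specific names under the standard names
    >>> configuration.num_attention_heads
    8
    ```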
    Zdbrx)r   r(   n_headsd_modeln_layersmax_seq_len)Znum_attention_headsZhidden_sizeZnum_hidden_layersZmax_position_embeddings          }  r	   NT{Gz?F
vocab_sizeresid_pdrop	emb_pdropr   r(   	use_cacheinitializer_rangeoutput_router_logitsr   c                    s   |d u r	t  | _nt|trt di || _n|| _|	d u r#t | _nt|	tr1tdi |	| _n|	| _|| _|| _|| _|| _	|| _
|| _|| _|
| _|| _|| _| jj| _|dd}|rctdt jdd|i| d S )Ntie_word_embeddingsFz5tie_word_embeddings is not supported for DBRX models.r   )r   r   
isinstancer5   r'   r(   r8   r7   r9   r:   r@   rA   rB   rC   rD   rE   r   Znum_key_value_headsr   r   r   r   )r   r8   r7   r9   r:   r@   rA   rB   r   r(   rC   rD   rE   r   rF   r   r   r   r      s2   




zDbrxConfig.__init__)r;   r<   r=   r;   r>   r	   r	   NNTr?   F)r   r    r!   r"   r   r   r'   Zsub_configsZattribute_mapr%   r$   r   boolr   r   r&   r   r   r   r   r6   w   s`    6
		
r6   N)r"   typingr   r   Zconfiguration_utilsr   utilsr   Z
get_loggerr   loggerr   r'   r6   __all__r   r   r   r   <module>   s   
)4
q