"""DBRX model configuration"""

from typing import Any, Optional

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class DbrxAttentionConfig(PretrainedConfig):
    """Configuration class for Dbrx Attention.

    [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*):
            If set, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (`int`, *optional*, defaults to 1): For grouped-query attention, the number of key/value heads.
        rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rotary position embeddings (RoPE).
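
    Example (a minimal sketch with illustrative values; this config is normally built for you
    from the `attn_config` dict passed to [`DbrxConfig`], but it can also be constructed directly):

    ```python
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxAttentionConfig

    >>> attn_config = DbrxAttentionConfig(attn_pdrop=0.0, clip_qkv=8.0, kv_n_heads=8)
    >>> attn_config.kv_n_heads
    8
    ```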
    """

    base_config_key = "attn_config"

    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: Optional[float] = None,
        kv_n_heads: int = 1,
        rope_theta: float = 10000.0,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads
        self.rope_theta = rope_theta

        # Drop bookkeeping keys that `PretrainedConfig` may have injected,
        # then reject anything left over.
        for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxFFNConfig(PretrainedConfig):
    """Configuration class for Dbrx FFN.

    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
    the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying the activation function for the FFN.
            The dict should have a key 'name' whose value is the name of the activation function, along with
            any additional keyword arguments. If `None`, this is set to `{"name": "silu"}`.
        ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
        moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture-of-experts layer.
        moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture-of-experts layer.
        moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture-of-experts layer.
        moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture-of-experts layer.
        moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
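
    Example (a minimal sketch with illustrative values; this config is normally built for you
    from the `ffn_config` dict passed to [`DbrxConfig`], but it can also be constructed directly):

    ```python
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxFFNConfig

    >>> ffn_config = DbrxFFNConfig(ffn_hidden_size=1024, moe_num_experts=8, moe_top_k=2)
    >>> ffn_config.ffn_act_fn  # defaults to SiLU when not specified
    {'name': 'silu'}
    ```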

    """

    base_config_key = "ffn_config"

    def __init__(
        self,
        ffn_act_fn: Optional[dict] = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: Optional[float] = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: Optional[float] = 1.0,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.ffn_hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights

        # Drop bookkeeping keys that `PretrainedConfig` may have injected,
        # then reject anything left over.
        for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash", "torch_dtype"]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")


class DbrxConfig(PretrainedConfig):
    r"""

    This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a different configuration from that of the [databricks/dbrx-instruct](https://huggingface.co/databricks/dbrx-instruct) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `input_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with the residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/value states (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated-normal initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
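
    The attention and FFN sub-modules can also be configured with plain dicts, which are converted to
    [`DbrxAttentionConfig`] and [`DbrxFFNConfig`] internally (a brief sketch; the values shown are
    illustrative, not the dbrx-instruct defaults):

    ```python
    >>> configuration = DbrxConfig(
    ...     n_layers=2,
    ...     d_model=256,
    ...     n_heads=8,
    ...     vocab_size=128,
    ...     attn_config={"kv_n_heads": 2, "rope_theta": 500000.0},
    ...     ffn_config={"moe_num_experts": 8, "moe_top_k": 2},
    ... )
    >>> configuration.attn_config.kv_n_heads
    2
    ```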
dbrx)r   r5   n_headsd_modeln_layersmax_seq_len)num_attention_headshidden_sizenum_hidden_layersmax_position_embeddings
vocab_sizeresid_pdrop	emb_pdropr   r5   	use_cacheinitializer_rangeoutput_router_logitsr   c                   > Uc  [        5       U l        O,[        U[        5      (       a  [        S0 UD6U l        OXl        U	c  [	        5       U l        O,[        U	[        5      (       a  [	        S0 U	D6U l        OXl        Xl        X l        X0l        X@l	        XPl
        X`l        Xpl        Xl        Xl        Xl        U R                  R                   U l        UR%                  SS5      nU(       a  ['        S5      e[(        TU ]T  " SSU0UD6  g )Ntie_word_embeddingsFz5tie_word_embeddings is not supported for DBRX models.r   )r	   r   
isinstancerB   r3   r5   rH   rG   rI   rJ   rO   rP   rQ   rR   rS   rT   r   num_key_value_headsr   r   r   r   )r   rH   rG   rI   rJ   rO   rP   rQ   r   r5   rR   rS   rT   r   rV   r!   s                  r"   r   DbrxConfig.__init__   s      24DT**2A[AD*+oDO
D))+9j9DO(O &$&""!2$8!#'#3#3#>#> $jj)>FTUUK-@KFKr$   )r   rH   rQ   r5   rS   rJ   rG   rI   rX   rT   rP   rR   rO   )         rZ   i }  r%   r%   NNTg{Gz?F)r'   r(   r)   r*   r+   r   r	   r3   sub_configsattribute_mapr.   r-   r   boolr   r   r/   r0   r1   s   @r"   rD   rD   w   s    4l J"5]SK( '#0	M  59.2#'%*.L.L .L 	.L
 .L .L .L .L 12.L ]+.L .L !.L #.L .L .Lr$   rD   N)r+   typingr   r   configuration_utilsr   utilsr   
get_loggerr'   loggerr	   r3   rD   __all__r   r$   r"   <module>rf      s\       3  
		H	%&9* &9R19$ 19hnL! nLb .r$   