
from ...configuration_utils import PretrainedConfig
from ...modeling_rope_utils import rope_config_validation
from ...utils import logging
from ..auto.configuration_auto import AutoConfig


logger = logging.get_logger(__name__)


class CsmDepthDecoderConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate a CSM depth decoder
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
a similar configuration to that of the csm-1b.

e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    num_codebooks (`int`, *optional*, defaults to 32):
        Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
    backbone_hidden_size (`int`, *optional*, defaults to 2048):
        Dimension of the hidden representations of the backbone model used with this depth decoder.
    vocab_size (`int`, *optional*, defaults to 2051):
        Vocabulary size of the CsmDepthDecoder model. Defines the number of different audio tokens that can be represented by each codebook.
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 4):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 2):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by mean-pooling all the original heads within that group. For more details, check out [this
        paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
        `num_attention_heads`.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to 33):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 2050):
        Padding token id.
    bos_token_id (`int`, *optional*):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*):
        End of stream token id.
    rope_theta (`float`, *optional*, defaults to 500000):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and you expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2.
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2.
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to the low frequency components of the RoPE.
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to the high frequency components of the RoPE.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the up_proj, down_proj and gate_proj layers of the MLP.
    head_dim (`int`, *optional*):
        The attention head dimension. If `None`, it will default to `hidden_size // num_attention_heads`.

```python
>>> from transformers import CsmDepthDecoderModel, CsmDepthDecoderConfig

>>> # Initializing a CsmDepthDecoder configuration
>>> configuration = CsmDepthDecoderConfig()

>>> # Initializing a model from the configuration
>>> model = CsmDepthDecoderModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
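>>> # Illustrative only (not part of the original example): configuration fields can be
>>> # overridden as keyword arguments; the values below are arbitrary placeholders, not csm-1b's.
>>> custom_configuration = CsmDepthDecoderConfig(
...     num_hidden_layers=2, rope_scaling={"rope_type": "linear", "factor": 2.0}
... )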
```"""

    model_type = "csm_depth_decoder_model"
    base_config_key = "depth_decoder_config"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        num_codebooks=32,
        backbone_hidden_size=2048,
        vocab_size=2051,
        hidden_size=1024,
        intermediate_size=8192,
        num_hidden_layers=4,
        num_attention_heads=8,
        num_key_value_heads=2,
        hidden_act="silu",
        max_position_embeddings=33,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=2050,
        bos_token_id=None,
        eos_token_id=None,
        rope_theta=500_000,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        head_dim=None,
        **kwargs,
    ):
        if kwargs.pop("tie_word_embeddings", False):
            raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=False,
            **kwargs,
        )
        self.num_codebooks = num_codebooks
        self.vocab_size = vocab_size
        self.backbone_hidden_size = backbone_hidden_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads

        # Validate the rotary position embedding parameters.
        # BC: if there is a legacy 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)


class CsmConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate a CSM
model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the csm-1b.

e.g. [eustlb/csm-1b](https://huggingface.co/eustlb/csm-1b)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    num_codebooks (`int`, *optional*, defaults to 32):
        Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
    vocab_size (`int`, *optional*, defaults to 2051):
        Vocabulary size of the Csm model. Defines the number of different audio tokens that can be represented by each codebook.
    text_vocab_size (`int`, *optional*, defaults to 128256):
        Vocabulary size of the text input for the Csm model. Defines the number of different text tokens that can be represented.
    hidden_size (`int`, *optional*, defaults to 2048):
        Dimension of the hidden representations of the backbone model.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimension of the MLP representations of the backbone model.
    num_hidden_layers (`int`, *optional*, defaults to 16):
        Number of hidden layers in the backbone model Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the backbone model Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 8):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by mean-pooling all the original heads within that group. For more details, check out [this
        paper](https://arxiv.org/pdf/2305.13245.pdf).
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the backbone model Transformer decoder.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 128002):
        Padding token id.
    codebook_pad_token_id (`int`, *optional*, defaults to 2050):
        Padding token id for codebook tokens.
    codebook_eos_token_id (`int`, *optional*, defaults to 0):
        End of stream token id for codebook tokens.
    bos_token_id (`int`, *optional*, defaults to 128000):
        Beginning of stream token id.
    eos_token_id (`int`, *optional*):
        End of stream token id.
    audio_token_id (`int`, *optional*, defaults to 128002):
        Audio token id in the text input.
    audio_eos_token_id (`int`, *optional*, defaults to 128003):
        End of stream token id for audio in the text input.
    rope_theta (`float`, *optional*, defaults to 500000):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*, defaults to `{'factor': 32.0, 'high_freq_factor': 0.5, 'low_freq_factor': 0.125, 'original_max_position_embeddings': 1024, 'rope_type': 'llama3'}`):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and you expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied to the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2.
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2.
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to the low frequency components of the RoPE.
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to the high frequency components of the RoPE.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the up_proj, down_proj and gate_proj layers of the MLP.
    head_dim (`int`, *optional*):
        The attention head dimension. If `None`, it will default to `hidden_size // num_attention_heads`.
    tie_codebooks_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie the codebook token embeddings of the backbone model to the codebook token embeddings of the depth decoder.
    depth_decoder_config (`CsmDepthDecoderConfig`, *optional*):
        Configuration for the depth decoder.
    codec_config (`PretrainedConfig`, *optional*):
        Configuration for the codec.

```python
>>> from transformers import CsmForConditionalGeneration, CsmConfig

>>> # Initializing a CsmConfig
>>> configuration = CsmConfig()

>>> # Initializing a model
>>> model = CsmForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
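>>> # Illustrative only (not part of the original example): sub-configs may be passed as plain
>>> # dicts and are converted into config objects; the override below is an arbitrary placeholder.
>>> custom_configuration = CsmConfig(depth_decoder_config={"num_hidden_layers": 2})
>>> custom_configuration.depth_decoder_config.num_hidden_layers
2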
```"""

    model_type = "csm"
    base_config_key = "csm_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"codec_config": AutoConfig, "depth_decoder_config": CsmDepthDecoderConfig}

    def __init__(
        self,
        num_codebooks=32,
        vocab_size=2051,
        text_vocab_size=128256,
        hidden_size=2048,
        intermediate_size=8192,
        num_hidden_layers=16,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        pad_token_id=128002,
        codebook_pad_token_id=2050,
        codebook_eos_token_id=0,
        bos_token_id=128000,
        eos_token_id=None,
        audio_token_id=128002,
        audio_eos_token_id=128003,
        rope_theta=500_000,
        rope_scaling={
            "factor": 32.0,
            "high_freq_factor": 0.5,
            "low_freq_factor": 0.125,
            "original_max_position_embeddings": 1024,
            "rope_type": "llama3",
        },
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        head_dim=None,
        tie_codebooks_embeddings=True,
        depth_decoder_config=None,
        codec_config=None,
        **kwargs,
    ):
        if kwargs.pop("tie_word_embeddings", False):
            raise ValueError("`tie_word_embeddings=True` is not supported for CsmConfig")

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=False,
            **kwargs,
        )

        if depth_decoder_config is None:
            self.depth_decoder_config = CsmDepthDecoderConfig()
            logger.info("depth_decoder_config is None, using default depth decoder config.")
        elif isinstance(depth_decoder_config, dict):
            self.depth_decoder_config = CsmDepthDecoderConfig(**depth_decoder_config)
        elif isinstance(depth_decoder_config, CsmDepthDecoderConfig):
            self.depth_decoder_config = depth_decoder_config

        if codec_config is None:
            self.codec_config = AutoConfig.for_model("mimi")
            logger.info("codec_config is None, using default audio encoder config.")
        elif isinstance(codec_config, dict):
            self.codec_config = AutoConfig.for_model(**codec_config)
        elif isinstance(codec_config, PretrainedConfig):
            self.codec_config = codec_config

        self.text_vocab_size = text_vocab_size
        self.vocab_size = vocab_size
        self.audio_token_id = audio_token_id
        self.audio_eos_token_id = audio_eos_token_id
        self.codebook_pad_token_id = codebook_pad_token_id
        self.codebook_eos_token_id = codebook_eos_token_id
        self.tie_codebooks_embeddings = tie_codebooks_embeddings

        self.num_codebooks = num_codebooks
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads

        # Validate the rotary position embedding parameters.
        # BC: if there is a legacy 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)


__all__ = ["CsmDepthDecoderConfig", "CsmConfig"]