
    fTh*                         S r SSKJr  SSKJr  SSKJr  \R                  " \5      r	 " S S\5      r
 " S S	\5      rS	S/rg
)zMusicGen model configuration   )PretrainedConfig)logging   )
AutoConfigc                   f   ^  \ rS rSrSrSrSrS/r                    SU 4S jjrSr	U =r
$ )	MusicgenDecoderConfig   aC  
This is the configuration class to store the configuration of an [`MusicgenDecoder`]. It is used to instantiate a
MusicGen decoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the MusicGen
[facebook/musicgen-small](https://huggingface.co/facebook/musicgen-small) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 2048):
        Vocabulary size of the MusicgenDecoder model. Defines the number of different tokens that can be
        represented by the `inputs_ids` passed when calling [`MusicgenDecoder`].
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the layers and the pooler layer.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of decoder layers.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer block.
    ffn_dim (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer block.
    activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the decoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, text_encoder, and pooler.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    activation_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for activations inside the fully connected layer.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with. Typically, set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_factor (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layerdrop (`float`, *optional*, defaults to 0.0):
        The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
        for more details.
    scale_embedding (`bool`, *optional*, defaults to `False`):
        Scale embeddings by diving by sqrt(hidden_size).
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether the model should return the last key/values attentions (not used by all models)
    num_codebooks (`int`, *optional*, defaults to 4):
        The number of parallel codebooks forwarded to the model.
    tie_word_embeddings(`bool`, *optional*, defaults to `False`):
        Whether input and output word embeddings should be tied.
    audio_channels (`int`, *optional*, defaults to 1
        Number of channels in the audio data. Either 1 for mono or 2 for stereo. Stereo models generate a separate
        audio stream for the left/right output channels. Mono models generate a single audio stream output.
musicgen_decoderdecoder_configpast_key_valuesc                   > Xl         X l        Xl        X@l        X0l        XPl        Xl        Xl        Xl        Xl	        Xl
        X`l        Xpl        Xl        Xl        US;  a  [        SU S35      eUU l        ["        TU ]H  " SUUUUS.UD6  g )N)   r   z4Expected 1 (mono) or 2 (stereo) audio channels, got z
 channels.)pad_token_idbos_token_ideos_token_idtie_word_embeddings )
vocab_sizemax_position_embeddingshidden_sizeffn_dimnum_hidden_layersnum_attention_headsdropoutattention_dropoutactivation_dropoutactivation_functioninitializer_factor	layerdrop	use_cachescale_embeddingnum_codebooks
ValueErroraudio_channelssuper__init__)selfr   r   r   r   r   r   r    r   r   r   r   r   r   r!   r"   r$   r   r   r   r   kwargs	__class__s                         k/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/musicgen/configuration_musicgen.pyr&   MusicgenDecoderConfig.__init__R   s    0 %'>$&!2#6 !2"4#6 "4"".*'STbSccmnoo, 	
%%% 3		

 	
    )r   r   r   r$   r   r   r   r   r   r   r   r"   r   r!   r    r   )   r-      i              Tgelui   g?r0   r0   g{Gz?F   r   r-   r-   NF)__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keykeys_to_ignore_at_inferencer&   __static_attributes____classcell__r)   s   @r*   r   r      sf    2h $J&O#4"5  $"!+2
 2
r,   r   c                   p   ^  \ rS rSrSrSr\\\S.rSr	U 4S jr
\S\S\S	\4S
 j5       r\S 5       rSrU =r$ )MusicgenConfig   a  
This is the configuration class to store the configuration of a [`MusicgenModel`]. It is used to instantiate a
MusicGen model according to the specified arguments, defining the text encoder, audio encoder and MusicGen decoder
configs.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    kwargs (*optional*):
        Dictionary of keyword arguments. Notably:

            - **text_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
              defines the text encoder config.
            - **audio_encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that
              defines the audio encoder config.
            - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration object that defines
              the decoder config.

Example:

```python
>>> from transformers import (
...     MusicgenConfig,
...     MusicgenDecoderConfig,
...     T5Config,
...     EncodecConfig,
...     MusicgenForConditionalGeneration,
... )

>>> # Initializing text encoder, audio encoder, and decoder model configurations
>>> text_encoder_config = T5Config()
>>> audio_encoder_config = EncodecConfig()
>>> decoder_config = MusicgenDecoderConfig()

>>> configuration = MusicgenConfig.from_sub_models_config(
...     text_encoder_config, audio_encoder_config, decoder_config
... )

>>> # Initializing a MusicgenForConditionalGeneration (with random weights) from the facebook/musicgen-small style configuration
>>> model = MusicgenForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
>>> config_text_encoder = model.config.text_encoder
>>> config_audio_encoder = model.config.audio_encoder
>>> config_decoder = model.config.decoder

>>> # Saving the model, including its configuration
>>> model.save_pretrained("musicgen-model")

>>> # loading model and config from pretrained folder
>>> musicgen_config = MusicgenConfig.from_pretrained("musicgen-model")
>>> model = MusicgenForConditionalGeneration.from_pretrained("musicgen-model", config=musicgen_config)
```musicgentext_encoderaudio_encoderdecoderTc                   > [         TU ]  " S0 UD6  SU;  d  SU;  d  SU;  a  [        S5      eUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      nUR                  S5      n[        R
                  " U40 UD6U l        [        R
                  " U40 UD6U l        [        S0 UD6U l	        SU l
        g )NrC   rD   rE   zPConfig has to be initialized with text_encoder, audio_encoder and decoder configr8   Tr   )r%   r&   r#   popr   	for_modelrC   rD   r   rE   is_encoder_decoder)r'   r(   text_encoder_configtext_encoder_model_typeaudio_encoder_configaudio_encoder_model_typer   r)   s          r*   r&   MusicgenConfig.__init__   s    "6"'?&+HI]cLcopp$jj8"5"9"9,"G%zz/:#7#;#;L#I I.&001H`L_`'112JcNbc,>~>"&r,   rJ   rL   r   c                 n    U " SUR                  5       UR                  5       UR                  5       S.UD6$ )z
Instantiate a [`MusicgenConfig`] (or a derived class) from text encoder, audio encoder and decoder
configurations.

Returns:
    [`MusicgenConfig`]: An instance of a configuration object
rB   r   )to_dict)clsrJ   rL   r   r(   s        r*   from_sub_models_config%MusicgenConfig.from_sub_models_config   sD       
,446.668"**,
 	
 	
r,   c                 .    U R                   R                  $ )N)rD   sampling_rate)r'   s    r*   rU   MusicgenConfig.sampling_rate   s     !!///r,   )rD   rE   rI   rC   )r3   r4   r5   r6   r7   r8   r   r   sub_configshas_no_defaults_at_initr&   classmethodr   rR   propertyrU   r;   r<   r=   s   @r*   r?   r?      sq    6p J"#(K
 #'$ 
-
 /
 .	
 
, 0 0r,   r?   N)r7   configuration_utilsr   utilsr   auto.configuration_autor   
get_loggerr3   loggerr   r?   __all__r   r,   r*   <module>ra      sR    # 3  0 
		H	%k
, k
\m0% m0` 4
5r,   