
    fTh[                         S r SSKrSSKrSSKJr  SSKJr  \R                  " \5      r	 " S S\5      r
 " S S	\5      rSS	/rg)
zSpeechT5 model configuration    N   )PretrainedConfig)loggingc                      ^  \ rS rSrSrSrSSS.r                                                         S
U 4S jjrS rS	r	U =r
$ )SpeechT5Config   a+  
This is the configuration class to store the configuration of a [`SpeechT5Model`]. It is used to instantiate a
SpeechT5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the SpeechT5
[microsoft/speecht5_asr](https://huggingface.co/microsoft/speecht5_asr) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 81):
        Vocabulary size of the SpeechT5 model. Defines the number of different tokens that can be represented by
        the `inputs_ids` passed to the forward method of [`SpeechT5Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    encoder_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    encoder_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer encoder.
    encoder_ffn_dim (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    encoder_layerdrop (`float`, *optional*, defaults to 0.1):
        The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
        for more details.
    decoder_layers (`int`, *optional*, defaults to 6):
        Number of hidden layers in the Transformer decoder.
    decoder_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer decoder.
    decoder_ffn_dim (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
    decoder_layerdrop (`float`, *optional*, defaults to 0.1):
        The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
        for more details.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` are supported.
    positional_dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for the text position encoding layers.
    hidden_dropout (`float`, *optional*, defaults to 0.1):
        The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
    attention_dropout (`float`, *optional*, defaults to 0.1):
        The dropout ratio for the attention probabilities.
    activation_dropout (`float`, *optional*, defaults to 0.1):
        The dropout ratio for activations inside the fully connected layer.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    layer_norm_eps (`float`, *optional*, defaults to 1e-5):
        The epsilon used by the layer normalization layers.
    scale_embedding (`bool`, *optional*, defaults to `False`):
        Scale embeddings by diving by sqrt(d_model).
    feat_extract_norm (`str`, *optional*, defaults to `"group"`):
        The norm to be applied to 1D convolutional layers in the speech encoder pre-net. One of `"group"` for group
        normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
        convolutional layers.
    feat_proj_dropout (`float`, *optional*, defaults to 0.0):
        The dropout probability for output of the speech encoder pre-net.
    feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the 1D convolutional layers of the feature
        extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
    conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
        A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
        speech encoder pre-net. The length of *conv_dim* defines the number of 1D convolutional layers.
    conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
        A tuple of integers defining the stride of each 1D convolutional layer in the speech encoder pre-net. The
        length of *conv_stride* defines the number of convolutional layers and has to match the length of
        *conv_dim*.
    conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
        A tuple of integers defining the kernel size of each 1D convolutional layer in the speech encoder pre-net.
        The length of *conv_kernel* defines the number of convolutional layers and has to match the length of
        *conv_dim*.
    conv_bias (`bool`, *optional*, defaults to `False`):
        Whether the 1D convolutional layers have a bias.
    num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
        Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
        embeddings layer.
    num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
        Number of groups of 1D convolutional positional embeddings layer.
    apply_spec_augment (`bool`, *optional*, defaults to `True`):
        Whether to apply *SpecAugment* data augmentation to the outputs of the speech encoder pre-net. For
        reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
        Recognition](https://arxiv.org/abs/1904.08779).
    mask_time_prob (`float`, *optional*, defaults to 0.05):
        Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
        procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
        reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
        masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
        actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
    mask_time_length (`int`, *optional*, defaults to 10):
        Length of vector span along the time axis.
    mask_time_min_masks (`int`, *optional*, defaults to 2),:
        The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
        irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
        mask_time_min_masks''
    mask_feature_prob (`float`, *optional*, defaults to 0.0):
        Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
        masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
        the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
        span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
        may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
        True`.
    mask_feature_length (`int`, *optional*, defaults to 10):
        Length of vector span along the feature axis.
    mask_feature_min_masks (`int`, *optional*, defaults to 0),:
        The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
        step, irrespectively of `mask_feature_prob`. Only relevant if
        ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
    num_mel_bins (`int`, *optional*, defaults to 80):
        Number of mel features used per input features. Used by the speech decoder pre-net. Should correspond to
        the value used in the [`SpeechT5Processor`] class.
    speech_decoder_prenet_layers (`int`, *optional*, defaults to 2):
        Number of layers in the speech decoder pre-net.
    speech_decoder_prenet_units (`int`, *optional*, defaults to 256):
        Dimensionality of the layers in the speech decoder pre-net.
    speech_decoder_prenet_dropout (`float`, *optional*, defaults to 0.5):
        The dropout probability for the speech decoder pre-net layers.
    speaker_embedding_dim (`int`, *optional*, defaults to 512):
        Dimensionality of the *XVector* embedding vectors.
    speech_decoder_postnet_layers (`int`, *optional*, defaults to 5):
        Number of layers in the speech decoder post-net.
    speech_decoder_postnet_units (`int`, *optional*, defaults to 256):
        Dimensionality of the layers in the speech decoder post-net.
    speech_decoder_postnet_kernel (`int`, *optional*, defaults to 5):
        Number of convolutional filter channels in the speech decoder post-net.
    speech_decoder_postnet_dropout (`float`, *optional*, defaults to 0.5):
        The dropout probability for the speech decoder post-net layers.
    reduction_factor (`int`, *optional*, defaults to 2):
        Spectrogram length reduction factor for the speech decoder inputs.
    max_speech_positions (`int`, *optional*, defaults to 4000):
        The maximum sequence length of speech features that this model might ever be used with.
    max_text_positions (`int`, *optional*, defaults to 450):
        The maximum sequence length of text features that this model might ever be used with.
    encoder_max_relative_position (`int`, *optional*, defaults to 160):
        Maximum distance for relative position embedding in the encoder.
    use_guided_attention_loss (`bool`, *optional*, defaults to `True`):
        Whether to apply guided attention loss while training the TTS model.
    guided_attention_loss_num_heads (`int`, *optional*, defaults to 2):
        Number of attention heads the guided attention loss will be applied to. Use -1 to apply this loss to all
        attention heads.
    guided_attention_loss_sigma (`float`, *optional*, defaults to 0.4):
        Standard deviation for guided attention loss.
    guided_attention_loss_scale (`float`, *optional*, defaults to 10.0):
        Scaling coefficient for guided attention loss (also known as lambda).
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).

Example:

```python
>>> from transformers import SpeechT5Model, SpeechT5Config

>>> # Initializing a "microsoft/speecht5_asr" style configuration
>>> configuration = SpeechT5Config()

>>> # Initializing a model (with random weights) from the "microsoft/speecht5_asr" style configuration
>>> model = SpeechT5Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```speecht5encoder_attention_headsencoder_layers)num_attention_headsnum_hidden_layersc:           
        > Xl         X l        X0l        XPl        X@l        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xl        Xl        UU l        UU l        UU l        UU l        UU l        UU l        [+        U5      U l        [+        U5      U l        [+        U5      U l        UU l        UU l        UU l        [9        U R,                  5      U l        [9        U R.                  5      U R:                  :w  dF  [9        U R0                  5      U R:                  :w  d#  [9        U R,                  5      U R:                  :w  aN  [=        S[9        U R,                  5       S[9        U R.                  5       S[9        U R0                  5       S35      eUU l        UU l         UU l!        UU l"        U U l#        U!U l$        U"U l%        U'U l&        U(U l'        U)U l(        U*U l)        U+U l*        U,U l+        U-U l,        U.U l-        U/U l.        U0U l/        U1U l0        U2U l1        U3U l2        U4U l3        U5U l4        U6U l5        U7U l6        U8U l7        U9U l8        [r        T;U ]  " SU#U$U%U9U&S.U:D6  g )NzConfiguration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` == `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) = z`, `len(config.conv_stride) = z`, `len(config.conv_kernel) = z`.)pad_token_idbos_token_ideos_token_idis_encoder_decoderdecoder_start_token_id );
vocab_sizehidden_sizer   encoder_ffn_dimr
   encoder_layerdropdecoder_layersdecoder_ffn_dimdecoder_attention_headsdecoder_layerdrop
hidden_actpositional_dropouthidden_dropoutattention_dropoutactivation_dropoutinitializer_rangelayer_norm_epsscale_embeddingfeat_extract_normfeat_proj_dropoutfeat_extract_activationlistconv_dimconv_strideconv_kernel	conv_biasnum_conv_pos_embeddingsnum_conv_pos_embedding_groupslennum_feat_extract_layers
ValueErrorapply_spec_augmentmask_time_probmask_time_lengthmask_time_min_masksmask_feature_probmask_feature_lengthmask_feature_min_masksnum_mel_binsspeech_decoder_prenet_layersspeech_decoder_prenet_unitsspeech_decoder_prenet_dropoutspeaker_embedding_dimspeech_decoder_postnet_layersspeech_decoder_postnet_unitsspeech_decoder_postnet_kernelspeech_decoder_postnet_dropoutreduction_factormax_speech_positionsmax_text_positionsencoder_max_relative_positionuse_guided_attention_lossguided_attention_loss_num_headsguided_attention_loss_sigmaguided_attention_loss_scale	use_cacher   super__init__)<selfr   r   r   r
   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r)   r*   r+   r,   r-   r.   r2   r3   r4   r5   r6   r7   r8   r   r   r   r   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   r   kwargs	__class__s<                                                              k/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/speecht5/configuration_speecht5.pyrL   SpeechT5Config.__init__   su   z %&,.'>$!2,.'>$!2$"4,!2"4!2,.!2!2'>$X,,"'>$-J*'*4=='9$ !!"d&B&BBD$$%)E)EEDMM"d&B&BB&''Ec$JZJZF[E\ ]//243C3C/D.ERI  #5, 0#6 !2#6 &<#(,H)+F(-J*%:"-J*,H)-J*.L+ 0$8!"4-J*)B&/N,+F(+F(""4 	
%%%1#9	
 	
    c                 b    [         R                  " [        R                  U R                  S5      $ )N   )	functoolsreduceoperatormulr*   )rM   s    rP   inputs_to_logits_ratio%SpeechT5Config.inputs_to_logits_ratioP  s!    d.>.>BBrR   )6r!   r2   r    r,   r)   r+   r*   r   r   r   r   r
   r   r   r   rE   r'   r%   r&   rG   rI   rH   r   r   r   r"   r   r#   r7   r8   r6   r4   r5   r3   rC   rD   r.   r-   r0   r9   r   rB   r$   r=   rA   r@   r>   r?   r<   r:   r;   rJ   rF   r   )9Q   i      r\      皙?   r]   r\   r^   gelur^   r^   r^   r^   g{Gz?gh㈵>Fgroup        r`   )   rc   rc   rc   rc   rc   rc   )      re   re   re   re   re   )
   r   r   r   r   re   re   F      Tg?rf   re   rb   rf   r   rT   r   re   re   P   re            ?rc   rd   rj   rd   rk   re   i  i     Tre   g?g      $@TT)__name__
__module____qualname____firstlineno____doc__
model_typeattribute_maprL   rY   __static_attributes____classcell__rO   s   @rP   r   r      s    ^@ J,E\lmM  " "! &4)* #&(  %&$'&)!&'%(&''*!&)"&()$'$(uO
bC CrR   r   c            
       f   ^  \ rS rSrSrSrSSS/ SQ/ SQ/ S	Q/ S
Q/ S
Q/ S
Q/SSS4
U 4S jjrSrU =r$ )SpeechT5HifiGanConfigiT  aU  
This is the configuration class to store the configuration of a [`SpeechT5HifiGanModel`]. It is used to instantiate
a SpeechT5 HiFi-GAN vocoder model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the SpeechT5
[microsoft/speecht5_hifigan](https://huggingface.co/microsoft/speecht5_hifigan) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    model_in_dim (`int`, *optional*, defaults to 80):
        The number of frequency bins in the input log-mel spectrogram.
    sampling_rate (`int`, *optional*, defaults to 16000):
        The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
    upsample_initial_channel (`int`, *optional*, defaults to 512):
        The number of input channels into the upsampling network.
    upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[4, 4, 4, 4]`):
        A tuple of integers defining the stride of each 1D convolutional layer in the upsampling network. The
        length of *upsample_rates* defines the number of convolutional layers and has to match the length of
        *upsample_kernel_sizes*.
    upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 8, 8]`):
        A tuple of integers defining the kernel size of each 1D convolutional layer in the upsampling network. The
        length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match the length of
        *upsample_rates*.
    resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
        A tuple of integers defining the kernel sizes of the 1D convolutional layers in the multi-receptive field
        fusion (MRF) module.
    resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
        A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
        multi-receptive field fusion (MRF) module.
    initializer_range (`float`, *optional*, defaults to 0.01):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    leaky_relu_slope (`float`, *optional*, defaults to 0.1):
        The angle of the negative slope used by the leaky ReLU activation.
    normalize_before (`bool`, *optional*, defaults to `True`):
        Whether or not to normalize the spectrogram before vocoding using the vocoder's learned mean and variance.

Example:

```python
>>> from transformers import SpeechT5HifiGan, SpeechT5HifiGanConfig

>>> # Initializing a "microsoft/speecht5_hifigan" style configuration
>>> configuration = SpeechT5HifiGanConfig()

>>> # Initializing a model (with random weights) from the "microsoft/speecht5_hifigan" style configuration
>>> model = SpeechT5HifiGan(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```hifiganri   i>  rc   )   rz   rz   rz   )   r{   r{   r{   )r         )rT   r   rd   g{Gz?r^   Tc                    > Xl         X l        X0l        X@l        XPl        X`l        Xpl        Xl        Xl        Xl	        [        TU ],  " S0 UD6  g )Nr   )model_in_dimsampling_rateupsample_initial_channelupsample_ratesupsample_kernel_sizesresblock_kernel_sizesresblock_dilation_sizesr"   leaky_relu_slopenormalize_beforerK   rL   )rM   r   r   r   r   r   r   r   r"   r   r   rN   rO   s               rP   rL   SpeechT5HifiGanConfig.__init__  sQ     )*(@%,%:"%:"'>$!2 0 0"6"rR   )
r"   r   r   r   r   r   r   r   r   r   )	rm   rn   ro   rp   rq   rr   rL   rt   ru   rv   s   @rP   rx   rx   T  sA    2h J !$#*(!*Iy A# #rR   rx   )rq   rU   rW   configuration_utilsr   utilsr   
get_loggerrm   loggerr   rx   __all__r   rR   rP   <module>r      sW    #   3  
		H	%vC% vCr	O#, O#d 4
5rR   