o
    ZhlY                     @   s   d Z ddlmZmZmZmZmZ ddlZddl	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZmZmZ eeZG d	d
 d
eZd
gZdS )z)Feature extractor class for UnivNetModel.    )AnyDictListOptionalUnionN   )mel_filter_bankoptimal_fft_lengthspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                )       s  e Zd ZdZg dZ									
														dFdedededededededede	e dedede	e d ed!ed"ed#ed$ed%ed&ed'ef( fd(d)Z
d*d+ Zd,d- Zd.ejd/ejfd0d1Z	dGd2ed3e	ejj d/ejfd4d5ZdGd/eej fd6d7Z												dHd8eejee eej eee  f de	e d9eeeef d:e	e d;ed<e	e d=ed3e	ejj d>ed?e	e de	e d@e	e dAe	eeef  d/efdBdCZd/eeef f fdDdEZ  ZS )IUnivNetFeatureExtractora  
    Constructs a UnivNet feature extractor.

    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
    STFT implementation follows that of TacoTron 2 and Hifi-GAN.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 24000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value to pad with when applying the padding strategy defined by the `padding` argument to
            [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
            `__call__` will also use this padding value.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
            performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 100):
            The number of mel-frequency bins in the extracted spectrogram features. This should match
            `UnivNetModel.config.num_mel_bins`.
        hop_length (`int`, *optional*, defaults to 256):
            The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
            that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
            the `hop_length` in ms.
        win_length (`int`, *optional*, defaults to 1024):
            The direct number of samples for each sliding window. Note that this is different from other audio feature
            extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        filter_length (`int`, *optional*, defaults to 1024):
            The number of FFT components to use. If `None`, this is determined using
            `transformers.audio_utils.optimal_fft_length`.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input length of the model in seconds. This is used to pad the audio.
        fmin (`float`, *optional*, defaults to 0.0):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*):
            Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
        mel_floor (`float`, *optional*, defaults to 1e-09):
            Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
            different than in [`transformers.audio_utils.spectrogram`].
        center (`bool`, *optional*, defaults to `False`):
            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
            `t` will start at time `t * hop_length`.
        compression_factor (`float`, *optional*, defaults to 1.0):
            The multiplicative compression factor for dynamic range compression during spectral normalization.
        compression_clip_val (`float`, *optional*, defaults to 1e-05):
            The clip value applied to the waveform before applying dynamic range compression during spectral
            normalization.
        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
            The min value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
            The max value used for Tacotron 2-style linear normalization. The default is the original value from the
            Tacotron 2 implementation.
        model_in_channels (`int`, *optional*, defaults to 64):
            The number of input channels to the [`UnivNetModel`] model. This should match
            `UnivNetModel.config.model_in_channels`.
        pad_end_length (`int`, *optional*, defaults to 10):
            If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
            number of appended samples will be `pad_end_length * hop_length`.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
    )input_featuresnoise_sequencepadding_mask   ]          Fd         hann_window
   N&.>      ?h㈵>    '    ă@@   Tfeature_sizesampling_ratepadding_valuedo_normalizenum_mel_bins
hop_length
win_lengthwin_functionfilter_lengthmax_length_sfminfmax	mel_floorcentercompression_factorcompression_clip_valnormalize_minnormalize_maxmodel_in_channelspad_end_lengthc              	      s
  t  jd||||d| || _|| _|| _|| _|| _|	| _|| _|d u r-t	|d }|| _
|| _|
| _|
| | _| jd u rGt| j| _n| j| _| jd d | _t| j| jdd| _t| j| j| j| j
| jddd| _|| _|| _|| _|| _|| _|| _|| _d S )	N)r#   r$   r%   return_attention_mask   r   T)Zwindow_lengthnameZperiodicZslaney)Znum_frequency_binsZnum_mel_filtersZmin_frequencyZmax_frequencyr$   ZnormZ	mel_scale )super__init__r&   r'   r(   r)   r*   r+   r-   floatr.   r/   r,   num_max_samplesr	   n_fftn_freqsr   windowr   r$   mel_filtersr0   r1   r2   r3   r4   r5   r6   )selfr#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   kwargs	__class__r:   e/var/www/auris/lib/python3.10/site-packages/transformers/models/univnet/feature_extraction_univnet.pyr<   e   sT   



z UnivNetFeatureExtractor.__init__c                 C   s   d|| j  | j| j    d S )Nr8   r   r3   r4   rC   r
   r:   r:   rG   	normalize      z!UnivNetFeatureExtractor.normalizec                 C   s   | j | j| j  |d d   S )Nr   r8   rH   rI   r:   r:   rG   denormalize   rK   z#UnivNetFeatureExtractor.denormalizewaveformreturnc                 C   s   t j|t| j| j d t| j| j d fdd}t|| j| j| j| jd| jddd	}t t 	|d t 
|d  | j }t | jj|}t t j|| jdd| j }|jS )a  
        Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
        `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.

        Args:
            waveform (`np.ndarray` of shape `(length,)`):
                The input waveform. This must be a single real-valued, mono waveform.

        Returns:
            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
        r8   Zreflect)modeN)rA   Zframe_lengthr(   Z
fft_lengthpowerr0   rB   r/   )Za_minZa_max)nppadintr?   r(   r
   rA   r0   sqrtrealimagr/   matmulrB   TlogZclipr2   r1   )rC   rM   Zcomplex_spectrogramZamplitude_spectrogrammel_spectrogramZlog_mel_spectrogramr:   r:   rG   rZ      s0   & z'UnivNetFeatureExtractor.mel_spectrogramnoise_length	generatorc                 C   s0   |du r	t j }|| jf}|j|t jd}|S )a  
        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
        [`UnivNetModel.forward`].

        Args:
            spectrogram_length (`int`):
                The length (dim 0) of the generated noise.
            model_in_channels (`int`, *optional*, defaults to `None`):
                The number of features (dim 1) of the generated noise. This should correspond to the
                `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
                `self.config.model_in_channels`.
            generator (`numpy.random.Generator`, *optional*, defaults to `None`)
                An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
                new generator with fresh entropy will be created.

        Returns:
            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
            model_in_channels)`.
        Ndtype)rQ   randomZdefault_rngr5   Zstandard_normalfloat32)rC   r[   r\   Znoise_shapenoiser:   r:   rG   generate_noise   s
   

z&UnivNetFeatureExtractor.generate_noisec                    s0   dd |D } dur fddt |D }|S )a  
        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
        lengths after removing padding.

        Args:
            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                The batched output waveforms from the [`UnivNetModel`].
            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
                The batched lengths of each waveform before padding.

        Returns:
            `List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
        c                 S   s"   g | ]}|  jd dd qS )cpuT)Zdevicecopy)detachtonumpy.0rM   r:   r:   rG   
<listcomp>  s   " z8UnivNetFeatureExtractor.batch_decode.<locals>.<listcomp>Nc                    s    g | ]\}}|d  |  qS Nr:   )ri   irM   waveform_lengthsr:   rG   rj     s     )	enumerate)rC   Z	waveformsrn   r:   rm   rG   batch_decode  s   z$UnivNetFeatureExtractor.batch_decode
raw_speechpadding
max_length
truncationpad_to_multiple_ofreturn_noisepad_end
pad_lengthr7   return_tensorsc              
      sl  |dur|nj }|dur(|jkr'tdjj dj dj d| d	ntdjj d t|tj	o?t
|jd	k}|rPt
|jd
krPtd |pct|ttfoct|d tj	ttf}|rndd |D }n&|st|tj	stj|tjd}nt|tj	r|jttju r|tj}|stj|tjdg}|	rdurnjfdd|D }td|i}j|||dur|nj|||d}|d}fdd|D }t|d trdd |D |d< n	dd |D |d< |d}|durdd |D |d< |r fdd|d D }||d< |r*fdd|d D |d< |dur4||}|S )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
                padding index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).

                If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*, defaults to `True`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_noise (`bool`, *optional*, defaults to `True`):
                Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
                An optional `numpy.random.Generator` random number generator to use when generating noise.
            pad_end (`bool`, *optional*, defaults to `False`):
                Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
                generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
                padding will be done before the padding strategy specified in `padding` is performed.
            pad_length (`int`, *optional*, defaults to `None`):
                If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
                will default to `self.config.pad_end_length`.
            do_normalize (`bool`, *optional*):
                Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
                the performance for some models. If not set, this will default to `self.config.do_normalize`.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.np.array` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r8   z2Only mono-channel audio is supported for input to r   c                 S      g | ]
}t j|t jd qS r]   rQ   asarrayr`   )ri   Zspeechr:   r:   rG   rj         z4UnivNetFeatureExtractor.__call__.<locals>.<listcomp>r]   c                    s(   g | ]}t j|d  j fjdqS )r   )Zconstant_values)rQ   rR   r(   r%   rh   )rx   rC   r:   rG   rj     s    r   )rr   rs   rt   ru   r7   c                       g | ]}  |qS r:   )rZ   rh   rC   r:   rG   rj     s    c                 S   r{   r|   r}   ri   Zmelr:   r:   rG   rj     r   c                 S   s   g | ]}| tjqS r:   )astyperQ   r`   r   r:   r:   rG   rj     s    attention_maskc                 S   r{   r|   )rQ   r~   Zint32)ri   arrayr:   r:   rG   rj     r   r   c                    s   g | ]} |jd   qS )r   )rb   shaperi   r
   )r\   rC   r:   rG   rj     s    r   c                    r   r:   )rJ   r   r   r:   rG   rj     s    
)r&   r$   
ValueErrorrF   __name__loggerwarning
isinstancerQ   ndarraylenr   listtupler~   r`   r^   Zfloat64r   r6   r   rR   r>   getr   Zconvert_to_tensors)rC   rq   r$   rr   rs   rt   ru   rv   r\   rw   rx   r&   r7   ry   Zis_batched_numpyZ
is_batchedZbatched_speechZpadded_inputsr   Zmel_spectrogramsr   ra   r:   )r\   rx   rC   rG   __call__  s   L
"






z UnivNetFeatureExtractor.__call__c                    s.   t   }g d}|D ]	}||v r||= q|S )N)rA   rB   r?   r@   r>   )r;   to_dict)rC   outputnamesr9   rE   r:   rG   r     s   
zUnivNetFeatureExtractor.to_dict)r   r   r   Fr   r   r   r   r   r   r   Nr   Fr   r   r    r!   r"   r   Trk   )NTNTNTNFNNNN)r   
__module____qualname____doc__Zmodel_input_namesrS   r=   boolstrr   r<   rJ   rL   rQ   r   rZ   r_   	Generatorrb   r   rp   r   r   r   r   r   r   r   r   __classcell__r:   r:   rE   rG   r      s    E	
L3

 "
	

 ""r   )r   typingr   r   r   r   r   rg   rQ   Zaudio_utilsr   r	   r
   r   Z!feature_extraction_sequence_utilsr   Zfeature_extraction_utilsr   utilsr   r   r   Z
get_loggerr   r   r   __all__r:   r:   r:   rG   <module>   s   
   
1