o
    Zh*                     @   s   d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ eeZG d	d
 d
eZd
gZdS )z"
Feature extractor class for CLVP
    )ListOptionalUnionN   )mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                       s   e Zd ZdZddgZ								
		d fdd	ZdejdejfddZ								dde
ejee eej eee  f dee dedee dee
eef  dee dee dee defddZ  ZS ) ClvpFeatureExtractora!  
    Constructs a CLVP feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts log-mel-spectrogram features from raw speech using a custom numpy implementation of the `Short
    Time Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 22050):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        default_audio_length (`int`, *optional*, defaults to 6):
            The default length of raw audio in seconds. If `max_length` is not set during `__call__` then it will
            automatically be set to default_audio_length * `self.sampling_rate`.
        hop_length (`int`, *optional*, defaults to 256):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 1024):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        mel_norms (`list` of length `feature_size`, *optional*):
            If `mel_norms` is provided then it will be used to normalize the log-mel spectrograms along each
            mel-filter.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether to return the attention mask. If left to the default, it will return the attention mask.

            [What are attention masks?](../glossary#attention-mask)
    input_featuresZattention_maskP   "V                      NFc
              	      sz   t  jd	||||	d|
 || _|| _|| _|| | _| j| | _|| _|| _|| _	t
d|d  |dd|ddd| _d S )
N)feature_sizesampling_ratepadding_valuereturn_attention_mask      r   g     @@ZslaneyZhtk)Znum_frequency_binsZnum_mel_filtersZmin_frequencyZmax_frequencyr   ZnormZ	mel_scale )super__init__n_fft
hop_lengthchunk_lengthZ	n_samplesZnb_max_framesr   default_audio_length	mel_normsr   mel_filters)selfr   r   r"   r    r!   r   r   r#   r   kwargs	__class__r   _/var/www/auris/lib/python3.10/site-packages/transformers/models/clvp/feature_extraction_clvp.pyr   G   s2   

zClvpFeatureExtractor.__init__waveformreturnc              	   C   sd   t |t| jd| j| jd| jdd}ttj|ddd}| jdur0|t	| jdddf  }|S )z
        This method first computes the log-mel spectrogram of the provided audio then applies normalization along the
        each mel-filterbank, if `mel_norms` is provided.
        Zhanng       @N)Zframe_lengthr    powerr$   Zlog_melgh㈵>)Za_minZa_max)
r   r   r   r    r$   nplogZclipr#   array)r%   r*   Zlog_specr   r   r)   _np_extract_fbank_featuresm   s   


z/ClvpFeatureExtractor._np_extract_fbank_featuresT
max_length
raw_speechr   
truncationpad_to_multiple_ofreturn_tensorsr   paddingc	              
      s  |dur| j krtd jj d j  d j  d| d	ntd jj d t|tjo6t	|j
d	k}
|
rGt	|j
d
krGtd  |
pZt|ttfoZt|d tjttf}|redd |D }n&|svt|tjsvtj|tjd}nt|tjr|jttju r|tj}|st|gjg}td|i}|du r j j  n|} j||||||d}|dd
dd	} fdd|d D }t|d trdd |D |d< n||d< ||S )a	  
        `ClvpFeatureExtractor` is used to extract various voice specific properties such as the pitch and tone of the
        voice, speaking speed, and even speaking defects like a lisp or stuttering from a sample voice or `raw_speech`.

        First the voice is padded or truncated in a way such that it becomes a waveform of `self.default_audio_length`
        seconds long and then the log-mel spectrogram is extracted from it.

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*, defaults to `True`):
                Whether to return the attention mask. If left to the default, it will return the attention mask.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            max_length (`int`, *optional*):
                The maximum input length of the inputs.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   c                 S   s    g | ]}t j|gt jd jqS )dtype)r-   asarrayfloat32T).0Zspeechr   r   r)   
<listcomp>   s     z1ClvpFeatureExtractor.__call__.<locals>.<listcomp>r8   r   )r6   r1   r3   r4   r   c                    s   g | ]}  |tjqS r   )r0   astyper-   r;   )r=   r*   r%   r   r)   r>      s    c                 S   s   g | ]}t |qS r   )r-   r:   )r=   featurer   r   r)   r>      s    )r   
ValueErrorr(   __name__loggerwarning
isinstancer-   ndarraylenshapelisttupler:   r;   r9   Zfloat64r?   r<   r
   r"   padgetZ	transposer   Zconvert_to_tensors)r%   r2   r   r3   r4   r5   r   r6   r1   r&   Zis_batched_numpyZ
is_batchedZbatched_speechZpadded_inputsr   r   r@   r)   __call__   s^   3
"


zClvpFeatureExtractor.__call__)	r   r   r   r   r   r   r   NF)NTNNTr1   N)rC   
__module____qualname____doc__Zmodel_input_namesr   r-   r/   rG   r0   r   r   floatr   intboolstrr   r
   rN   __classcell__r   r   r'   r)   r   !   sP    #&"	r   )rQ   typingr   r   r   numpyr-   Zaudio_utilsr   r   r   Z!feature_extraction_sequence_utilsr	   Zfeature_extraction_utilsr
   utilsr   r   Z
get_loggerrC   rD   r   __all__r   r   r   r)   <module>   s   
 
Q