"""Feature extractor class for SpeechT5."""

import warnings
from typing import Any, Dict, List, Optional, Union

import numpy as np

from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import PaddingStrategy, TensorType, logging


logger = logging.get_logger(__name__)


class SpeechT5FeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a SpeechT5 feature extractor.

    This class can pre-process a raw speech signal by (optionally) normalizing to zero-mean unit-variance, for use by
    the SpeechT5 speech encoder prenet.

    This class can also extract log-mel filter bank features from raw speech, for use by the SpeechT5 speech decoder
    prenet.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `False`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models.
        num_mel_bins (`int`, *optional*, defaults to 80):
            The number of mel-frequency bins in the extracted spectrogram features.
        hop_length (`int`, *optional*, defaults to 16):
            Number of ms between windows. Otherwise referred to as "shift" in many papers.
        win_length (`int`, *optional*, defaults to 64):
            Number of ms per window.
        win_function (`str`, *optional*, defaults to `"hann_window"`):
            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
        frame_signal_scale (`float`, *optional*, defaults to 1.0):
            Constant that the frames are multiplied by before applying the DFT. This argument is deprecated.
        fmin (`float`, *optional*, defaults to 80):
            Minimum mel frequency in Hz.
        fmax (`float`, *optional*, defaults to 7600):
            Maximum mel frequency in Hz.
        mel_floor (`float`, *optional*, defaults to 1e-10):
            Minimum value of the mel frequency banks.
        reduction_factor (`int`, *optional*, defaults to 2):
            Spectrogram length reduction factor. This argument is deprecated.
        return_attention_mask (`bool`, *optional*, defaults to `True`):
            Whether or not [`~SpeechT5FeatureExtractor.__call__`] should return `attention_mask`.
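
    Example (a minimal usage sketch with the default settings; output shapes depend on the input length):

    ```python
    >>> import numpy as np

    >>> from transformers import SpeechT5FeatureExtractor

    >>> feature_extractor = SpeechT5FeatureExtractor()
    >>> raw_speech = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz

    >>> # waveform features for the speech encoder prenet
    >>> inputs = feature_extractor(audio=raw_speech, sampling_rate=16000, return_tensors="np")
    >>> waveform = inputs["input_values"]  # shape (batch, num_samples)

    >>> # log-mel spectrogram features for the speech decoder prenet
    >>> targets = feature_extractor(audio_target=raw_speech, sampling_rate=16000, return_tensors="np")
    >>> log_mel = targets["input_values"]  # shape (batch, num_frames, num_mel_bins)
    ```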
    """

    model_input_names = ["input_values", "attention_mask"]

    def __init__(
        self,
        feature_size: int = 1,
        sampling_rate: int = 16000,
        padding_value: float = 0.0,
        do_normalize: bool = False,
        num_mel_bins: int = 80,
        hop_length: int = 16,
        win_length: int = 64,
        win_function: str = "hann_window",
        frame_signal_scale: float = 1.0,
        fmin: float = 80,
        fmax: float = 7600,
        mel_floor: float = 1e-10,
        reduction_factor: int = 2,
        return_attention_mask: bool = True,
        **kwargs,
    ):
        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)

        self.do_normalize = do_normalize
        self.return_attention_mask = return_attention_mask

        self.num_mel_bins = num_mel_bins
        self.hop_length = hop_length
        self.win_length = win_length
        self.win_function = win_function
        self.frame_signal_scale = frame_signal_scale
        self.fmin = fmin
        self.fmax = fmax
        self.mel_floor = mel_floor
        self.reduction_factor = reduction_factor

        # hop_length and win_length are given in milliseconds; convert them to samples
        # (e.g. 64 ms * 16000 Hz / 1000 = 1024 samples per window)
        self.sample_size = win_length * sampling_rate // 1000
        self.sample_stride = hop_length * sampling_rate // 1000
        self.n_fft = optimal_fft_length(self.sample_size)
        self.n_freqs = (self.n_fft // 2) + 1

        self.window = window_function(window_length=self.sample_size, name=self.win_function, periodic=True)

        self.mel_filters = mel_filter_bank(
            num_frequency_bins=self.n_freqs,
            num_mel_filters=self.num_mel_bins,
            min_frequency=self.fmin,
            max_frequency=self.fmax,
            sampling_rate=self.sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )

        if frame_signal_scale != 1.0:
            warnings.warn(
                "The argument `frame_signal_scale` is deprecated and will be removed in version 4.30.0 of Transformers",
                FutureWarning,
            )
        if reduction_factor != 2.0:
            warnings.warn(
                "The argument `reduction_factor` is deprecated and will be removed in version 4.30.0 of Transformers",
                FutureWarning,
            )

    @staticmethod
    def zero_mean_unit_var_norm(
        input_values: List[np.ndarray], attention_mask: List[np.ndarray], padding_value: float = 0.0
    ) -> List[np.ndarray]:
        """
        Every array in the list is normalized to have zero mean and unit variance
        """
        if attention_mask is not None:
            attention_mask = np.array(attention_mask, np.int32)
            normed_input_values = []

            for vector, length in zip(input_values, attention_mask.sum(-1)):
                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
                if length < normed_slice.shape[0]:
                    normed_slice[length:] = padding_value

                normed_input_values.append(normed_slice)
        else:
            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]

        return normed_input_values

    def _extract_mel_features(
        self,
        one_waveform: np.ndarray,
    ) -> np.ndarray:
        """
        Extracts log-mel filterbank features for one waveform array (unbatched).
        """
        log_mel_spec = spectrogram(
            one_waveform,
            window=self.window,
            frame_length=self.sample_size,
            hop_length=self.sample_stride,
            fft_length=self.n_fft,
            mel_filters=self.mel_filters,
            mel_floor=self.mel_floor,
            log_mel="log10",
        )
        return log_mel_spec.T

    def __call__(
        self,
        audio: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
        audio_target: Optional[Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]]] = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        sampling_rate: Optional[int] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model one or several sequence(s).

        Pass in a value for `audio` to extract waveform features. Pass in a value for `audio_target` to extract log-mel
        spectrogram features.

        Args:
            audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. This outputs waveform features. Must
                be mono channel audio, not stereo, i.e. single float per timestep.
            audio_target (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`, *optional*):
                The sequence or batch of sequences to be processed as targets. Each sequence can be a numpy array, a
                list of float values, a list of numpy arrays or a list of list of float values. This outputs log-mel
                spectrogram features.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of lists of Python numbers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `audio` or `audio_target` input was sampled. It is strongly recommended
                to pass `sampling_rate` at the forward call to prevent silent errors.
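
        Example (an illustrative sketch of batched usage; the silent waveforms are placeholders for real audio):

        ```python
        >>> import numpy as np

        >>> from transformers import SpeechT5FeatureExtractor

        >>> feature_extractor = SpeechT5FeatureExtractor()
        >>> speech = [np.zeros(12000, dtype=np.float32), np.zeros(16000, dtype=np.float32)]

        >>> # pad the waveforms to a common length and extract log-mel targets in the same call
        >>> batch = feature_extractor(
        ...     audio=speech,
        ...     audio_target=speech,
        ...     sampling_rate=16000,
        ...     padding=True,
        ...     return_tensors="np",
        ... )
        >>> # batch now holds "input_values", "attention_mask", "labels" and "decoder_attention_mask"
        ```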
        """
        if audio is None and audio_target is None:
            raise ValueError("You must provide either `audio` or `audio_target` values.")

        if sampling_rate is not None:
            if sampling_rate != self.sampling_rate:
                raise ValueError(
                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
                    f" {self.sampling_rate} and not {sampling_rate}."
                )
        else:
            logger.warning(
                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
                "Failing to do so can result in silent errors that might be hard to debug."
            )

        if audio is not None:
            inputs = self._process_audio(
                audio,
                False,
                padding,
                max_length,
                truncation,
                pad_to_multiple_of,
                return_attention_mask,
                return_tensors,
                **kwargs,
            )
        else:
            inputs = None

        if audio_target is not None:
            inputs_target = self._process_audio(
                audio_target,
                True,
                padding,
                max_length,
                truncation,
                pad_to_multiple_of,
                return_attention_mask,
                return_tensors,
                **kwargs,
            )

            if inputs is None:
                return inputs_target
            else:
                inputs["labels"] = inputs_target["input_values"]
                decoder_attention_mask = inputs_target.get("attention_mask")
                if decoder_attention_mask is not None:
                    inputs["decoder_attention_mask"] = decoder_attention_mask

        return inputs

    def _process_audio(
        self,
        speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        is_target: bool = False,
        padding: Union[bool, str, PaddingStrategy] = False,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        is_batched_numpy = isinstance(speech, np.ndarray) and len(speech.shape) > 1
        if is_batched_numpy and len(speech.shape) > 2:
            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
        is_batched = is_batched_numpy or (
            isinstance(speech, (list, tuple)) and (isinstance(speech[0], (np.ndarray, tuple, list)))
        )

        if is_batched:
            speech = [np.asarray(speech, dtype=np.float32) for speech in speech]
        elif not is_batched and not isinstance(speech, np.ndarray):
            speech = np.asarray(speech, dtype=np.float32)
        elif isinstance(speech, np.ndarray) and speech.dtype is np.dtype(np.float64):
            speech = speech.astype(np.float32)

        # always return batch
        if not is_batched:
            speech = [speech]

        # needed to make pad() work on spectrogram inputs
        feature_size_hack = self.feature_size

        # convert into correct format for padding
        if is_target:
            features = [self._extract_mel_features(waveform) for waveform in speech]
            encoded_inputs = BatchFeature({"input_values": features})
            self.feature_size = self.num_mel_bins
        else:
            encoded_inputs = BatchFeature({"input_values": speech})

        padded_inputs = self.pad(
            encoded_inputs,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
            **kwargs,
        )

        self.feature_size = feature_size_hack

        # convert input values to correct format
        input_values = padded_inputs["input_values"]
        if not isinstance(input_values[0], np.ndarray):
            padded_inputs["input_values"] = [np.asarray(array, dtype=np.float32) for array in input_values]
        elif (
            not isinstance(input_values, np.ndarray)
            and isinstance(input_values[0], np.ndarray)
            and input_values[0].dtype is np.dtype(np.float64)
        ):
            padded_inputs["input_values"] = [array.astype(np.float32) for array in input_values]
        elif isinstance(input_values, np.ndarray) and input_values.dtype is np.dtype(np.float64):
            padded_inputs["input_values"] = input_values.astype(np.float32)

        # convert attention_mask to correct format
        attention_mask = padded_inputs.get("attention_mask")
        if attention_mask is not None:
            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]

        # zero-mean and unit-variance normalization
        if not is_target and self.do_normalize:
            attention_mask = (
                attention_mask
                if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
                else None
            )
            padded_inputs["input_values"] = self.zero_mean_unit_var_norm(
                padded_inputs["input_values"], attention_mask=attention_mask, padding_value=self.padding_value
            )

        if return_tensors is not None:
            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)

        return padded_inputs

    def to_dict(self) -> Dict[str, Any]:
        output = super().to_dict()

        # Don't serialize these attributes; they are derived from the other properties.
        names = ["window", "mel_filters", "sample_size", "sample_stride", "n_fft", "n_freqs"]
        for name in names:
            if name in output:
                del output[name]

        return output


__all__ = ["SpeechT5FeatureExtractor"]