o
    Zh=                     @   s   d Z ddlmZmZmZ ddlZddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZ e r;ddlZeeZG d
d deZdgZdS )z%
Feature extractor class for Whisper
    )ListOptionalUnionN   )is_torch_available)mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypeloggingc                       s>  e Zd ZdZdgZ									d( fd
d	Zdejdedej	fddZ
d)dejdedej	fddZe	d*deej	 deej	 dedeej	 fddZ										d+deej	ee eej	 eee  f dedee d eeeef  d!ee d"ee dee d#ee d$ee dee d%ee defd&d'Z  ZS ),WhisperFeatureExtractora  
    Constructs a Whisper feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the `Short Time
    Fourier Transform` which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 80):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        hop_length (`int`, *optional*, defaults to 160):
            Length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
        chunk_length (`int`, *optional*, defaults to 30):
            The maximum number of chunks of `sampling_rate` samples used to trim and pad longer or shorter audio
            sequences.
        n_fft (`int`, *optional*, defaults to 400):
            Size of the Fourier transform.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        dither (`float`, *optional*, defaults to 0.0):
            Adds dithering. In other words, adds a small Gaussian noise to each frame.
            E.g. use 0.0001 to add dithering with a normal distribution centered
            around 0.0 with standard deviation 0.0001 (assuming [-1,+1] range of raw_speech).
            The value 0.0 means no dithering.
            Dithering has similar effect as `spectrogram(mel_floor=...)`. It reduces
            the high log_mel_fbank values for signals with hard-zero sections,
            when VAD cutoff is present in the signal.
    input_featuresP   >                  Fc	           
   	      st   t  jd||||d|	 || _|| _|| _|| | _| j| | _|| _|| _t	d|d  |dd|ddd| _
d S )	N)feature_sizesampling_ratepadding_valuereturn_attention_mask      r   g     @@Zslaney)Znum_frequency_binsZnum_mel_filtersZmin_frequencyZmax_frequencyr   ZnormZ	mel_scale )super__init__n_fft
hop_lengthchunk_length	n_samplesZnb_max_framesr   ditherr   mel_filters)
selfr   r   r    r!   r   r   r#   r   kwargs	__class__r   e/var/www/auris/lib/python3.10/site-packages/transformers/models/whisper/feature_extraction_whisper.pyr   H   s0   

z WhisperFeatureExtractor.__init__waveform_batchdevicereturnc                 C   s   |dkrt d| dg }|D ]5}t|t| jd| j| jd| j| jdd}|dddd	f }t||	 d
 }|d d }|
| qt|}|S )z
        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
        implementation with 1e-5 tolerance.
        cpuzGot device `z` for feature extraction, but feature extraction on CUDA accelerator devices requires torch, which is not installed. Either set `device='cpu'`, or install torch according to the official instructions: https://pytorch.org/get-started/locally/Zhanng       @log10)Zframe_lengthr    powerr#   r$   Zlog_melN       @      @)
ValueErrorr   r	   r   r    r#   r$   npmaximummaxappendarray)r%   r*   r+   Zlog_spec_batchwaveformlog_specr   r   r)   _np_extract_fbank_featuresl   s,   



z2WhisperFeatureExtractor._np_extract_fbank_featuresr-   r9   c           
      C   s*  t ||t j}t j| j|d}| jdkr'|| jt j|j|j	|j
d 7 }t j|| j| j|dd}|dddf  d	 }t | j|t j}|j| }t j|d
d }| d	krw|jd	ddd jdddd }	t ||	d }n
t || d }|d d }|dkr|  }| S )z
        Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching,
        yielding results similar to cpu computing with 1e-5 tolerance.
        )r+   r   )dtyper+   T)windowZreturn_complex.Nr0   r   g|=)min)dimZkeepdimr   r   r1   r2   r-   )torchZ
from_numpytofloat32Zhann_windowr   r#   Zrandnshaper<   r+   stftr    absr$   Tclampr.   r?   r6   r5   detachr-   numpy)
r%   r9   r+   r=   rD   Z
magnitudesr$   Zmel_specr:   Zmax_valr   r   r)   _torch_extract_fbank_features   s"   
 
 z5WhisperFeatureExtractor._torch_extract_fbank_featuresinput_valuesattention_maskr   c                 C   s   |durEt |t j}g }t| |dD ]-\}}||d|   t |d|  d  }||jd k r=|||d< |	| q|S dd | D }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        Nr0   Hz>r   c                 S   s*   g | ]}||   t| d   qS )rM   )meanr4   sqrtvar).0xr   r   r)   
<listcomp>   s   * zCWhisperFeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>)
r4   r8   Zint32zipsumrN   rO   rP   rC   r7   )rK   rL   r   Znormed_input_valuesZvectorlengthZnormed_slicer   r   r)   zero_mean_unit_var_norm   s   .z/WhisperFeatureExtractor.zero_mean_unit_var_normTN
max_length
raw_speech
truncationpad_to_multiple_ofreturn_tensorsr   paddingr   do_normalizereturn_token_timestampsc              
      sN  |dur| j krtd jj d j  d j  d| d	ntd jj d t|tjo6t	|j
d	k}|rGt	|j
d
krGtd  |pZt|ttfoZt|d tjttf}|redd |D }n&|svt|tjsvtj|tjd}nt|tjr|jttju r|tj}|st|gjg}td|i} j|||r|n j|||p|	d}|	rɈ j|d |d  jd|d< tj|d dd|d< |dd
dd	}t rو jn j}||d |
}t|d trdd |D |d< n||d< |r|d dddd jf |d< |dur fdd|D |d< |dur%| |}|S )a  
        Main method to featurize and prepare for the model one or several sequence(s). Implementation uses PyTorch for
        the STFT computation if available, otherwise a slower NumPy based one.

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`bool`, *optional*, default to `True`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*, defaults to None):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                For Whisper models, `attention_mask` should always be passed for batched inference, to avoid subtle
                bugs.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
            padding_value (`float`, *optional*, defaults to 0.0):
                The value that is used to fill the padding values / vectors.
            do_normalize (`bool`, *optional*, defaults to `False`):
                Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
                improve the performance of the model.
            device (`str`, *optional*, defaults to `'cpu'`):
                Specifies the device for computation of the log-mel spectrogram of audio signals in the
                `_torch_extract_fbank_features` method. (e.g., "cpu", "cuda")
            return_token_timestamps (`bool`, *optional*, defaults to `None`):
                Whether or not to return the number of frames of the input raw_speech.
                These num_frames can be used by the model to compute word level timestamps.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r   r   z2Only mono-channel audio is supported for input to r   c                 S   s    g | ]}t j|gt jd jqS r<   )r4   asarrayrB   rF   )rQ   Zspeechr   r   r)   rS     s     z4WhisperFeatureExtractor.__call__.<locals>.<listcomp>rb   r   )r]   rX   rZ   r[   r   rL   )rL   r   )Zaxisc                 S   s   g | ]
}t j|t jd qS ra   )r4   rc   rB   )rQ   featurer   r   r)   rS   @  s    c                    s   g | ]	}t | j qS r   )lenr    )rQ   Zraw_speech_ir%   r   r)   rS   J  s    Z
num_frames)!r   r3   r(   __name__loggerwarning
isinstancer4   ndarrayre   rC   listtuplerc   rB   r<   Zfloat64ZastyperF   r   padr"   rW   r   stackgetZ	transposer   rJ   r;   r   r    Zconvert_to_tensors)r%   rY   rZ   r[   r\   r   r]   rX   r   r^   r+   r_   r&   Zis_batched_numpyZ
is_batchedZbatched_speechZpadded_inputsr   Zextract_fbank_featuresr   rf   r)   __call__   sv   B
"

 


z WhisperFeatureExtractor.__call__)r   r   r   r   r   r   r   F)r-   )r   )
TNNNrX   NNNr-   N)rg   
__module____qualname____doc__Zmodel_input_namesr   r4   r8   strrk   r;   rJ   staticmethodr   floatrW   r   boolr   intr   r   rq   __classcell__r   r   r'   r)   r   $   sx    !$"	
r   )rt   typingr   r   r   rI   r4    r   Zaudio_utilsr   r   r	   Z!feature_extraction_sequence_utilsr
   Zfeature_extraction_utilsr   utilsr   r   r@   Z
get_loggerrg   rh   r   __all__r   r   r   r)   <module>   s   
  
0