o
    ZhI                     @   s   d Z ddlZddlmZmZmZmZmZ ddlZ	ddl
Z
ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ eeZed
dG dd deZdgZdS )z!Feature extractor class for CLAP.    N)AnyDictListOptionalUnion   )mel_filter_bankspectrogramwindow_function)SequenceFeatureExtractor)BatchFeature)
TensorTypelogging)requires)torch)backendsc                       s   e Zd ZdZddgZ								
					d(dededee dedef
 fddZ	de
eef fddZd)dejdeej dejfddZdd Zdejdejfd d!Z					d*d"eejee eej eee  f dee dee d#ee d$ee d%eeeef  defd&d'Z  ZS )+ClapFeatureExtractora  
    Constructs a CLAP feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts mel-filter bank features from raw speech using a custom numpy implementation of the *Short Time
    Fourier Transform* (STFT) which should match pytorch's `torch.stft` equivalent.

    Args:
        feature_size (`int`, *optional*, defaults to 64):
            The feature dimension of the extracted Mel spectrograms. This corresponds to the number of mel filters
            (`n_mels`).
        sampling_rate (`int`, *optional*, defaults to 48000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). This only serves
            to warn users if the audio fed to the feature extractor does not have the same sampling rate.
        hop_length (`int`,*optional*, defaults to 480):
            Length of the overlapping windows for the STFT used to obtain the Mel Spectrogram. The audio will be split
            in smaller `frames` with a step of `hop_length` between each frame.
        max_length_s (`int`, *optional*, defaults to 10):
            The maximum input length of the model in seconds. This is used to pad the audio.
        fft_window_size (`int`, *optional*, defaults to 1024):
            Size of the window (in samples) on which the Fourier transform is applied. This controls the frequency
            resolution of the spectrogram. 400 means that the fourier transform is computed on windows of 400 samples.
        padding_value (`float`, *optional*, defaults to 0.0):
            Padding value used to pad the audio. Should correspond to silences.
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not the model should return the attention masks corresponding to the input.
        frequency_min (`float`, *optional*, defaults to 0):
            The lowest frequency of interest. The STFT will not be computed for values below this.
        frequency_max (`float`, *optional*, defaults to 14000):
            The highest frequency of interest. The STFT will not be computed for values above this.
        top_db (`float`, *optional*):
            The highest decibel value used to convert the mel spectrogram to the log scale. For more details see the
            `audio_utils.power_to_db` function
        truncation (`str`, *optional*, defaults to `"fusion"`):
            Truncation pattern for long audio inputs. Two patterns are available:
                - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and a
                  downsampled version of the entire mel spectrogram.
            If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a copy
            of the original mel obtained from the padded audio.
                - `rand_trunc` will select a random crop of the mel spectrogram.
        padding (`str`, *optional*, defaults to `"repeatpad"`):
               Padding pattern for shorter audio inputs. Three patterns were originally implemented:
                - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
                - `repeat`: the audio is repeated and then cut to fit the `max_length`
                - `pad`: the audio is padded.
    input_features	is_longer@   逻    
              Fr   6  Nfusion	repeatpadfrequency_minfrequency_maxtop_db
truncationpaddingc              	      s   t  jd||||d| |
| _|| _|| _|| _|d? d | _|| _|| _|| | _	|| _
|| _|	| _t| j|||	|d dd| _t| j|||	|ddd| _d S )N)feature_sizesampling_ratepadding_valuereturn_attention_mask   Zhtk)Znum_frequency_binsZnum_mel_filtersZmin_frequencyZmax_frequencyr$   ZnormZ	mel_scaleZslaney )super__init__r    r!   r"   fft_window_sizeZnb_frequency_bins
hop_lengthmax_length_snb_max_samplesr$   r   r   r   mel_filtersmel_filters_slaney)selfr#   r$   r,   r-   r+   r%   r&   r   r   r    r!   r"   kwargs	__class__r(   _/var/www/auris/lib/python3.10/site-packages/transformers/models/clap/feature_extraction_clap.pyr*   V   sJ   
	zClapFeatureExtractor.__init__returnc                 C   s8   t | j}| jj|d< d|v r|d= d|v r|d= |S )a)  
        Serializes this instance to a Python dictionary.

        Returns:
            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, except for the
            mel filter banks, which do not need to be saved or printed as they are too long.
        Zfeature_extractor_typer/   r0   )copydeepcopy__dict__r4   __name__)r1   outputr(   r(   r5   to_dict   s   zClapFeatureExtractor.to_dictwaveformr/   c              	   C   s(   t |t| jd| j| jd|dd}|jS )a  
        Compute the log-mel spectrogram of the provided `waveform` using the Hann window. In CLAP, two different filter
        banks are used depending on the truncation pattern:
            - `self.mel_filters`: they correspond to the default parameters of `torchaudio` which can be obtained from
              calling `torchaudio.transforms.MelSpectrogram().mel_scale.fb`. These filters are used when `truncation`
              is set to `"fusion"`.
            - `self.mel_filteres_slaney` : they correspond to the default parameters of `librosa` which used
              `librosa.filters.mel` when computing the mel spectrogram. These filters were only used in the original
              implementation when the truncation mode is not `"fusion"`.
        Zhanng       @ZdB)Zframe_lengthr,   powerr/   Zlog_mel)r	   r
   r+   r,   T)r1   r=   r/   Zlog_mel_spectrogramr(   r(   r5   _np_extract_fbank_features   s   
	z/ClapFeatureExtractor._np_extract_fbank_featuresc                 C   s&  t ttd|| d d}t|d dkrdg|d< t|d dkr)dg|d< t j|d }t j|d }t j|d }|||| d d f }|||| d d f }	|||| d d f }
t|d d d d f }tj	j
j||dgddd}|d d  }t j|||	|
gdd	}|S )
Nr   r'   r      r   ZbilinearF)sizemodeZalign_cornersZaxis)npZarray_splitlistrangelenrandomchoicer   ZtensornnZ
functionalZinterpolatenumpystack)r1   meltotal_frameschunk_framesrangesZ	idx_frontZ
idx_middleZidx_backZmel_chunk_frontZmel_chunk_middleZmel_chunk_backZ
mel_shrinkZ
mel_fusionr(   r(   r5   _random_mel_fusion   s$   

z'ClapFeatureExtractor._random_mel_fusionc                 C   s  |j d |krv|dkr5d}t|| }tjd|d }||||  }| || jdddf }||fS |dkrn| || j}	|| j d }
|	j d }|
|kratj	|	|	|	|	gdd}d}||fS | 
|	||
}d}||fS td	| d
d}|j d |k r|dkrt|t| }t||d d| }|dkrt|t| }t||}tj|d||j d  fddd}|dkr| || j}tj	||||gdd}||fS | || jdddf }||fS )a  
        Extracts the mel spectrogram and prepares it for the mode based on the `truncation` and `padding` arguments.
        Four different path are possible:
            - `truncation="fusion"` and the length of the waveform is greater than the max length: the mel spectrogram
              will be computed on the entire audio. 3 random crops and a dowsampled version of the full mel spectrogram
              are then stacked together. They will later be used for `feature_fusion`.
            - `truncation="rand_trunc"` and the length of the waveform is smaller than the max length: the audio is
              padded based on `padding`.
            - `truncation="fusion"` and the length of the waveform is smaller than the max length: the audio is padded
              based on `padding`, and is repeated `4` times.
            - `truncation="rand_trunc"` and the length of the waveform is greater than the max length: the mel
              spectrogram will be computed on a random crop of the waveform.

        r   Z
rand_truncTr'   Nr   rD   Fzdata_truncating z not implementedrepeatr   Zconstant)rC   Zconstant_values)shaperH   rE   rI   randintr@   r0   r/   r,   rM   rR   NotImplementedErrorintZtilepad)r1   r=   
max_lengthr!   r"   longeroverflowidx	input_melrN   rP   rO   Zn_repeatr(   r(   r5   _get_input_mel   sF   "
 z#ClapFeatureExtractor._get_input_mel
raw_speechrY   r$   return_tensorsc              
      s
  durnj rnj|dur/|jkr.tdjj dj dj d| d	ntdjj d t|t	j
oFt|jd	k}|rWt|jd
krWtd |pjt|ttfojt|d t	j
ttf}	|	rudd |D }n&|	st|t	j
st	j|t	jd}nt|t	j
r|jt	t	ju r|t	j}|	st	|g} fdd|D }
g }g }|
D ]\}}|| || qdkrt|dkrt	jdt|}d||< t|d trdd |D }dd |D }||d}t|}|dur||}|S )af  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            truncation (`str`, *optional*):
                Truncation pattern for long audio inputs. Two patterns are available:
                    - `fusion` will use `_random_mel_fusion`, which stacks 3 random crops from the mel spectrogram and
                      a downsampled version of the entire mel spectrogram.
                If `config.fusion` is set to True, shorter audios also need to to return 4 mels, which will just be a
                copy of the original mel obtained from the padded audio.
                    - `rand_trunc` will select a random crop of the mel spectrogram.
            padding (`str`, *optional*):
               Padding pattern for shorter audio inputs. Three patterns were originally implemented:
                    - `repeatpad`: the audio is repeated, and then padded to fit the `max_length`.
                    - `repeat`: the audio is repeated and then cut to fit the `max_length`
                    - `pad`: the audio is padded.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.np.array` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
                pipeline.
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r'   rA   z2Only mono-channel audio is supported for input to r   c                 S      g | ]
}t j|t jd qS dtyperE   asarrayfloat64).0Zspeechr(   r(   r5   
<listcomp>E      z1ClapFeatureExtractor.__call__.<locals>.<listcomp>rd   c                    s&   g | ]} | r njqS r(   )r^   r.   )ri   r=   rY   r"   r1   r!   r(   r5   rj   P  s    r   Tc                 S   rb   rc   rf   )ri   featurer(   r(   r5   rj   a  rk   c                 S   s   g | ]}|gqS r(   r(   )ri   rZ   r(   r(   r5   rj   d  s    )r   r   )r!   r"   r$   
ValueErrorr4   r:   loggerwarning
isinstancerE   ndarrayrH   rT   rF   tuplerg   rh   re   ZastypeappendsumrI   rU   r   r   Zconvert_to_tensors)r1   r_   r!   r"   rY   r$   r`   r2   Zis_batched_numpyZ
is_batchedZpadded_inputsr]   r   rN   rZ   Zrand_idxr   r(   rl   r5   __call__  sd   )
"



zClapFeatureExtractor.__call__)r   r   r   r   r   r   Fr   r   Nr   r   )N)NNNNN)r:   
__module____qualname____doc__Zmodel_input_namesfloatr   rW   strr*   r   r   r<   rE   arrayrr   r@   rR   r^   r   r   r   r   rv   __classcell__r(   r(   r3   r5   r   !   sd    1	
5"="	r   )ry   r7   typingr   r   r   r   r   rL   rE   r   Zaudio_utilsr   r	   r
   Z!feature_extraction_sequence_utilsr   Zfeature_extraction_utilsr   utilsr   r   Zutils.import_utilsr   Z
get_loggerr:   ro   r   __all__r(   r(   r(   r5   <module>   s    
  
O