# coding=utf-8
"""Feature extractor class for Pop2Piano"""

import warnings
from typing import List, Optional, Union

import numpy
import numpy as np

from ...audio_utils import mel_filter_bank, spectrogram
from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
from ...feature_extraction_utils import BatchFeature
from ...utils import (
    TensorType,
    is_essentia_available,
    is_librosa_available,
    is_scipy_available,
    logging,
    requires_backends,
)
from ...utils.import_utils import requires


if is_essentia_available():
    import essentia
    import essentia.standard

if is_librosa_available():
    import librosa

if is_scipy_available():
    import scipy


logger = logging.get_logger(__name__)


@requires(backends=("essentia", "librosa", "scipy", "torch"))
class Pop2PianoFeatureExtractor(SequenceFeatureExtractor):
    r"""
    Constructs a Pop2Piano feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    This class extracts rhythm and preprocesses the audio before it is passed to the model. First the audio is passed
    to the `RhythmExtractor2013` algorithm, which extracts the beat_times and beat positions and estimates their
    confidence as well as the tempo in bpm. The beat_times are then interpolated to get beatsteps, from which we
    calculate the extrapolated_beatsteps used by the tokenizer. In parallel, the audio is resampled to
    self.sampling_rate and preprocessed, and a log-mel spectrogram is computed from it to be fed to the transformer
    model.

    Args:
        sampling_rate (`int`, *optional*, defaults to 22050):
            Target sampling rate of the audio signal. It's the sampling rate that we forward to the model.
        padding_value (`int`, *optional*, defaults to 0):
            Padding value used to pad the audio. Should correspond to silences.
        window_size (`int`, *optional*, defaults to 4096):
            Length of the window in samples to which the Fourier transform is applied.
        hop_length (`int`, *optional*, defaults to 1024):
            Step size between each window of the waveform, in samples.
        min_frequency (`float`, *optional*, defaults to 10.0):
            Lowest frequency that will be used in the log-mel spectrogram.
        feature_size (`int`, *optional*, defaults to 512):
            The feature dimension of the extracted features.
        num_bars (`int`, *optional*, defaults to 2):
            Determines interval between each sequence.
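
    Example (a minimal usage sketch of the main `__call__` API; `"song.wav"` is a hypothetical local audio
    file, while `"sweetcocoa/pop2piano"` is the public Pop2Piano checkpoint):

    ```python
    >>> import librosa
    >>> from transformers import Pop2PianoFeatureExtractor

    >>> feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
    >>> raw_audio, sr = librosa.load("song.wav", sr=44100)  # mono waveform and its sampling rate
    >>> inputs = feature_extractor(audio=raw_audio, sampling_rate=sr, return_tensors="pt")
    ```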
    """

    model_input_names = ["input_features", "beatsteps", "extrapolated_beatstep"]

    def __init__(
        self,
        sampling_rate: int = 22050,
        padding_value: int = 0,
        window_size: int = 4096,
        hop_length: int = 1024,
        min_frequency: float = 10.0,
        feature_size: int = 512,
        num_bars: int = 2,
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            padding_value=padding_value,
            **kwargs,
        )
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value
        self.window_size = window_size
        self.hop_length = hop_length
        self.min_frequency = min_frequency
        self.feature_size = feature_size
        self.num_bars = num_bars
        self.mel_filters = mel_filter_bank(
            num_frequency_bins=(self.window_size // 2) + 1,
            num_mel_filters=self.feature_size,
            min_frequency=self.min_frequency,
            max_frequency=float(self.sampling_rate // 2),
            sampling_rate=self.sampling_rate,
            norm=None,
            mel_scale="htk",
        )

    def mel_spectrogram(self, sequence: np.ndarray):
        """
        Generates MelSpectrogram.

        Args:
            sequence (`numpy.ndarray`):
                The sequence of which the mel-spectrogram will be computed.
        """
        mel_specs = []
        for seq in sequence:
            window = np.hanning(self.window_size + 1)[:-1]
            mel_specs.append(
                spectrogram(
                    waveform=seq,
                    window=window,
                    frame_length=self.window_size,
                    hop_length=self.hop_length,
                    power=2.0,
                    mel_filters=self.mel_filters,
                )
            )
        mel_specs = np.array(mel_specs)

        return mel_specs

    def extract_rhythm(self, audio: np.ndarray):
        """
        This algorithm (`RhythmExtractor2013`) extracts the beat positions and estimates their confidence as well as
        tempo in bpm for an audio signal. For more information please visit
        https://essentia.upf.edu/reference/std_RhythmExtractor2013.html .

        Args:
            audio (`numpy.ndarray`):
                raw audio waveform which is passed to the Rhythm Extractor.
        """
        requires_backends(self, ["essentia"])
        essentia_tracker = essentia.standard.RhythmExtractor2013(method="multifeature")
        bpm, beat_times, confidence, estimates, essentia_beat_intervals = essentia_tracker(audio)

        return bpm, beat_times, confidence, estimates, essentia_beat_intervals

    def interpolate_beat_times(
        self, beat_times: numpy.ndarray, steps_per_beat: numpy.ndarray, n_extend: numpy.ndarray
    ):
        """
        This method takes beat_times and interpolates them using `scipy.interpolate.interp1d`; the output is then
        used to convert the raw audio to a log-mel-spectrogram.

        Args:
            beat_times (`numpy.ndarray`):
                beat_times is passed into `scipy.interpolate.interp1d` for processing.
            steps_per_beat (`int`):
                Used as a parameter to control the interpolation.
            n_extend (`int`):
                Used as a parameter to control the interpolation.
        """
        requires_backends(self, ["scipy"])
        beat_times_function = scipy.interpolate.interp1d(
            np.arange(beat_times.size),
            beat_times,
            bounds_error=False,
            fill_value="extrapolate",
        )

        ext_beats = beat_times_function(
            np.linspace(0, beat_times.size + n_extend - 1, beat_times.size * steps_per_beat + n_extend)
        )

        return ext_beats

    def preprocess_mel(self, audio: np.ndarray, beatstep: np.ndarray):
        """
        Preprocessing for log-mel-spectrogram

        Args:
            audio (`numpy.ndarray` of shape `(audio_length, )` ):
                Raw audio waveform to be processed.
            beatstep (`numpy.ndarray`):
                Interpolated values of the raw audio. If beatstep[0] is greater than 0.0, then it will be shifted by
                the value at beatstep[0].
        """
        if audio is not None and len(audio.shape) != 1:
            raise ValueError(
                f"Expected `audio` to be a single channel audio input of shape `(n, )` but found shape {audio.shape}."
            )
        if beatstep[0] > 0.0:
            beatstep = beatstep - beatstep[0]

        num_steps = self.num_bars * 4
        num_target_steps = len(beatstep)
        extrapolated_beatstep = self.interpolate_beat_times(
            beat_times=beatstep, steps_per_beat=1, n_extend=(self.num_bars + 1) * 4 + 1
        )

        sample_indices = []
        max_feature_length = 0
        for i in range(0, num_target_steps, num_steps):
            start_idx = i
            end_idx = min(i + num_steps, num_target_steps)
            start_sample = int(extrapolated_beatstep[start_idx] * self.sampling_rate)
            end_sample = int(extrapolated_beatstep[end_idx] * self.sampling_rate)
            sample_indices.append((start_sample, end_sample))
            max_feature_length = max(max_feature_length, end_sample - start_sample)

        padded_batch = []
        for start_sample, end_sample in sample_indices:
            feature = audio[start_sample:end_sample]
            padded_feature = np.pad(
                feature,
                ((0, max_feature_length - feature.shape[0]),),
                "constant",
                constant_values=0,
            )
            padded_batch.append(padded_feature)

        padded_batch = np.asarray(padded_batch)
        return padded_batch, extrapolated_beatstep

    def _pad(self, features: np.ndarray, add_zero_line=True):
        features_shapes = [each_feature.shape for each_feature in features]
        attention_masks, padded_features = [], []
        for i, each_feature in enumerate(features):
            # To pad "input_features".
            if len(each_feature.shape) == 3:
                features_pad_value = max([*zip(*features_shapes)][1]) - features_shapes[i][1]
                attention_mask = np.ones(features_shapes[i][:2], dtype=np.int64)
                feature_padding = ((0, 0), (0, features_pad_value), (0, 0))
                attention_mask_padding = (feature_padding[0], feature_padding[1])

            # To pad "beatsteps" and "extrapolated_beatstep".
            else:
                each_feature = each_feature.reshape(1, -1)
                features_pad_value = max([*zip(*features_shapes)][0]) - features_shapes[i][0]
                attention_mask = np.ones(features_shapes[i], dtype=np.int64).reshape(1, -1)
                feature_padding = attention_mask_padding = ((0, 0), (0, features_pad_value))

            each_padded_feature = np.pad(each_feature, feature_padding, "constant", constant_values=self.padding_value)
            attention_mask = np.pad(
                attention_mask, attention_mask_padding, "constant", constant_values=self.padding_value
            )

            if add_zero_line:
                # if it is batched then we separate each examples using zero array
                zero_array_len = max([*zip(*features_shapes)][1])

                # we concatenate the zero array line here
                each_padded_feature = np.concatenate(
                    [each_padded_feature, np.zeros([1, zero_array_len, self.feature_size])], axis=0
                )
                attention_mask = np.concatenate(
                    [attention_mask, np.zeros([1, zero_array_len], dtype=attention_mask.dtype)], axis=0
                )

            padded_features.append(each_padded_feature)
            attention_masks.append(attention_mask)

        padded_features = np.concatenate(padded_features, axis=0).astype(np.float32)
        attention_masks = np.concatenate(attention_masks, axis=0).astype(np.int64)

        return padded_features, attention_masks

    def pad(
        self,
        inputs: BatchFeature,
        is_batched: bool,
        return_attention_mask: bool,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ):
        """
        Pads the inputs to the same length and returns the attention_mask.

        Args:
            inputs (`BatchFeature`):
                Processed audio features.
            is_batched (`bool`):
                Whether inputs are batched or not.
            return_attention_mask (`bool`):
                Whether to return attention mask or not.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
                If nothing is specified, it will return list of `np.ndarray` arrays.
        Return:
            `BatchFeature` with attention_mask, attention_mask_beatsteps and attention_mask_extrapolated_beatstep added
            to it:
            - **attention_mask** numpy.ndarray of shape `(batch_size, max_input_features_seq_length)` --
                Example:
                    1, 1, 1, 0, 0 (audio 1, also here it is padded to max length of 5 that's why there are 2 zeros at
                    the end indicating they are padded)

                    0, 0, 0, 0, 0 (zero pad to separate audio 1 and 2)

                    1, 1, 1, 1, 1 (audio 2)

                    0, 0, 0, 0, 0 (zero pad to separate audio 2 and 3)

                    1, 1, 1, 1, 1 (audio 3)
            - **attention_mask_beatsteps** numpy.ndarray of shape `(batch_size, max_beatsteps_seq_length)`
            - **attention_mask_extrapolated_beatstep** numpy.ndarray of shape `(batch_size,
              max_extrapolated_beatstep_seq_length)`
        """
        processed_features_dict = {}
        for feature_name, feature_value in inputs.items():
            if feature_name == "input_features":
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=True)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict["attention_mask"] = attention_mask
            else:
                padded_feature_values, attention_mask = self._pad(feature_value, add_zero_line=False)
                processed_features_dict[feature_name] = padded_feature_values
                if return_attention_mask:
                    processed_features_dict[f"attention_mask_{feature_name}"] = attention_mask

        # If we are processing only one example, remove the zero array line since we don't need it to
        # separate examples from each other.
        if not is_batched and not return_attention_mask:
            processed_features_dict["input_features"] = processed_features_dict["input_features"][:-1, ...]

        outputs = BatchFeature(processed_features_dict, tensor_type=return_tensors)

        return outputs

    def __call__(
        self,
        audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
        sampling_rate: Union[int, List[int]],
        steps_per_beat: int = 2,
        resample: Optional[bool] = True,
        return_attention_mask: Optional[bool] = False,
        return_tensors: Optional[Union[str, TensorType]] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to featurize and prepare for the model.

        Args:
            audio (`np.ndarray`, `List`):
                The audio or batch of audio to be processed. Each audio can be a numpy array, a list of float values, a
                list of numpy arrays or a list of list of float values.
            sampling_rate (`int`):
                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            steps_per_beat (`int`, *optional*, defaults to 2):
                This is used in interpolating `beat_times`.
            resample (`bool`, *optional*, defaults to `True`):
                Determines whether to resample the audio to `sampling_rate` or not before processing. Must be True
                during inference.
            return_attention_mask (`bool`, *optional*, defaults to `False`):
                Denotes if attention_mask for input_features, beatsteps and extrapolated_beatstep will be given as
                output or not. Automatically set to True for batched inputs.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
                If nothing is specified, it will return list of `np.ndarray` arrays.
        """
        requires_backends(self, ["librosa"])
        is_batched = bool(isinstance(audio, (list, tuple)) and isinstance(audio[0], (np.ndarray, tuple, list)))

        if is_batched:
            # This enables the user to process files of different sampling_rate.
            if not isinstance(sampling_rate, list):
                raise ValueError(
                    "Please give sampling_rate of each audio separately when you are passing multiple raw_audios at "
                    f"the same time. Received {sampling_rate}, expected [audio_1_sr, ..., audio_n_sr]."
                )
            return_attention_mask = True if return_attention_mask is None else return_attention_mask
        else:
            audio = [audio]
            sampling_rate = [sampling_rate]
            return_attention_mask = False if return_attention_mask is None else return_attention_mask

        batch_input_features, batch_beatsteps, batch_ext_beatstep = [], [], []
        for single_raw_audio, single_sampling_rate in zip(audio, sampling_rate):
            bpm, beat_times, confidence, estimates, essentia_beat_intervals = self.extract_rhythm(
                audio=single_raw_audio
            )
            beatsteps = self.interpolate_beat_times(beat_times=beat_times, steps_per_beat=steps_per_beat, n_extend=1)

            if self.sampling_rate != single_sampling_rate and self.sampling_rate is not None:
                if resample:
                    # Change sampling_rate to self.sampling_rate
                    single_raw_audio = librosa.core.resample(
                        single_raw_audio,
                        orig_sr=single_sampling_rate,
                        target_sr=self.sampling_rate,
                        res_type="kaiser_best",
                    )
                else:
                    warnings.warn(
                        "The sampling_rate of the provided audio is different from the target sampling_rate of the "
                        f"Feature Extractor, {self.sampling_rate} vs {single_sampling_rate}. In these cases it is "
                        "recommended to use `resample=True` in the `__call__` method to get the optimal behaviour."
                    )

            single_sampling_rate = self.sampling_rate
            start_sample = int(beatsteps[0] * single_sampling_rate)
            end_sample = int(beatsteps[-1] * single_sampling_rate)

            input_features, extrapolated_beatstep = self.preprocess_mel(
                single_raw_audio[start_sample:end_sample], beatsteps - beatsteps[0]
            )

            mel_specs = self.mel_spectrogram(input_features.astype(np.float32))

            # apply np.log to get log mel-spectrograms
            log_mel_specs = np.log(np.clip(mel_specs, a_min=1e-6, a_max=None))

            input_features = np.transpose(log_mel_specs, (0, -1, -2))

            batch_input_features.append(input_features)
            batch_beatsteps.append(beatsteps)
            batch_ext_beatstep.append(extrapolated_beatstep)

        output = BatchFeature(
            {
                "input_features": batch_input_features,
                "beatsteps": batch_beatsteps,
                "extrapolated_beatstep": batch_ext_beatstep,
            }
        )

        output = self.pad(
            output,
            is_batched=is_batched,
            return_attention_mask=return_attention_mask,
            return_tensors=return_tensors,
        )

        return output


__all__ = ["Pop2PianoFeatureExtractor"]