o
    ZŽh¬  ã                   @   s*   d Z ddlmZ G dd„ deƒZdgZdS )z$Speech processor class for SpeechT5.é   )ÚProcessorMixinc                       sH   e Zd ZdZdZdZ‡ fdd„Zdd„ Zdd	„ Zd
d„ Z	dd„ Z
‡  ZS )ÚSpeechT5Processora}  
    Constructs a SpeechT5 processor which wraps a feature extractor and a tokenizer into a single processor.

    [`SpeechT5Processor`] offers all the functionalities of [`SpeechT5FeatureExtractor`] and [`SpeechT5Tokenizer`]. See
    the docstring of [`~SpeechT5Processor.__call__`] and [`~SpeechT5Processor.decode`] for more information.

    Args:
        feature_extractor (`SpeechT5FeatureExtractor`):
            An instance of [`SpeechT5FeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`SpeechT5Tokenizer`):
            An instance of [`SpeechT5Tokenizer`]. The tokenizer is a required input.
    ZSpeechT5FeatureExtractorZSpeechT5Tokenizerc                    s   t ƒ  ||¡ d S )N)ÚsuperÚ__init__)ÚselfÚfeature_extractorÚ	tokenizer©Ú	__class__© ú_/var/www/auris/lib/python3.10/site-packages/transformers/models/speecht5/processing_speecht5.pyr   %   s   zSpeechT5Processor.__init__c                 O   sd  |  dd¡}|  dd¡}|  dd¡}|  dd¡}|  dd¡}|dur*|dur*tdƒ‚|dur6|dur6tdƒ‚|du rJ|du rJ|du rJ|du rJtd	ƒ‚|dur]| j|g|¢R d|i|¤Ž}n|durk| j|fi |¤Ž}nd}|dur| j|||d
œ|¤Ž}	|	d }
n|dur“| j|fi |¤Ž}	|	d }
nd}	|du r›|	S |	dur°|
|d< |	 d¡}|dur°||d< |S )a  
        Processes audio and text input, as well as audio and text targets.

        You can process audio by using the argument `audio`, or process audio targets by using the argument
        `audio_target`. This forwards the arguments to SpeechT5FeatureExtractor's
        [`~SpeechT5FeatureExtractor.__call__`].

        You can process text by using the argument `text`, or process text labels by using the argument `text_target`.
        This forwards the arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.__call__`].

        Valid input combinations are:

        - `text` only
        - `audio` only
        - `text_target` only
        - `audio_target` only
        - `text` and `audio_target`
        - `audio` and `audio_target`
        - `text` and `text_target`
        - `audio` and `text_target`

        Please refer to the docstring of the above two methods for more information.
        ÚaudioNÚtextÚtext_targetÚaudio_targetÚsampling_ratez\Cannot process both `audio` and `text` inputs. Did you mean `audio_target` or `text_target`?z\Cannot process both `audio_target` and `text_target` inputs. Did you mean `audio` or `text`?zaYou need to specify either an `audio`, `audio_target`, `text`, or `text_target` input to process.)r   r   Úinput_valuesÚ	input_idsÚlabelsÚattention_maskÚdecoder_attention_mask)ÚpopÚ
ValueErrorr   r   Úget)r   ÚargsÚkwargsr   r   r   r   r   ÚinputsÚtargetsr   r   r   r   r   Ú__call__(   sJ   ÿÿ ÿ


zSpeechT5Processor.__call__c           
      O   sd  |  dd¡}|  dd¡}|  dd¡}|dur|durtdƒ‚|du r.|du r.|du r.tdƒ‚|dur@| jj|g|¢R i |¤Ž}n|durO| jj|fi |¤Ž}nd}|dur“d|v sdt|tƒrsd|d v rs| jj|fi |¤Ž}|d }n"| jj}| jj| j_| jj|g|¢R i |¤Ž}|| j_|d }nd}|du r›|S |dur°||d< | 	d¡}	|	dur°|	|d	< |S )
au  
        Collates the audio and text inputs, as well as their targets, into a padded batch.

        Audio inputs are padded by SpeechT5FeatureExtractor's [`~SpeechT5FeatureExtractor.pad`]. Text inputs are padded
        by SpeechT5Tokenizer's [`~SpeechT5Tokenizer.pad`].

        Valid input combinations are:

        - `input_ids` only
        - `input_values` only
        - `labels` only, either log-mel spectrograms or text tokens
        - `input_ids` and log-mel spectrogram `labels`
        - `input_values` and text `labels`

        Please refer to the docstring of the above two methods for more information.
        r   Nr   r   z:Cannot process both `input_values` and `input_ids` inputs.zZYou need to specify either an `input_values`, `input_ids`, or `labels` input to be padded.é    r   r   )
r   r   r   Úpadr   Ú
isinstanceÚlistZfeature_sizeZnum_mel_binsr   )
r   r   r   r   r   r   r   r   Zfeature_size_hackr   r   r   r   r    o   s@   ÿ


zSpeechT5Processor.padc                 O   ó   | j j|i |¤ŽS )z¿
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r   Úbatch_decode©r   r   r   r   r   r   r$   «   ó   zSpeechT5Processor.batch_decodec                 O   r#   )z¹
        This method forwards all its arguments to SpeechT5Tokenizer's [`~SpeechT5Tokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   Údecoder%   r   r   r   r'   ²   r&   zSpeechT5Processor.decode)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Zfeature_extractor_classZtokenizer_classr   r   r    r$   r'   Ú__classcell__r   r   r	   r   r      s    G<r   N)r+   Zprocessing_utilsr   r   Ú__all__r   r   r   r   Ú<module>   s
    
'