o
    Zh_-                     @   sp   d Z ddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZmZmZ eeZG dd	 d	eZd	gZdS )
z&
Feature extractor class for Wav2Vec2
    )ListOptionalUnionN   )SequenceFeatureExtractor)BatchFeature)PaddingStrategy
TensorTypeloggingc                       s   e Zd ZdZddgZ					d fd	d
	Ze	ddeej	 deej	 de
deej	 fddZ							ddeej	ee
 eej	 eee
  f deeeef dee dedee dee deeeef  dee defddZ  ZS )Wav2Vec2FeatureExtractora  
    Constructs a Wav2Vec2 feature extractor.

    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
    most of the main methods. Users should refer to this superclass for more information regarding those methods.

    Args:
        feature_size (`int`, *optional*, defaults to 1):
            The feature dimension of the extracted features.
        sampling_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
        padding_value (`float`, *optional*, defaults to 0.0):
            The value that is used to fill the padding values.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
            improve the performance for some models, *e.g.*,
            [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
        return_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.

            <Tip>

            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
            [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no `attention_mask`
            should be passed.

            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
            [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
            passed for batched inference.

            </Tip>input_valuesattention_mask   >          FTc                    s*   t  jd|||d| || _|| _d S )N)feature_sizesampling_ratepadding_value )super__init__return_attention_maskdo_normalize)selfr   r   r   r   r   kwargs	__class__r   g/var/www/auris/lib/python3.10/site-packages/transformers/models/wav2vec2/feature_extraction_wav2vec2.pyr   C   s   	
z!Wav2Vec2FeatureExtractor.__init__r   returnc                 C   s   |durEt |t j}g }t| |dD ]-\}}||d|   t |d|  d  }||jd k r=|||d< |	| q|S dd | D }|S )z[
        Every array in the list is normalized to have zero mean and unit variance
        NHz>r   c                 S   s*   g | ]}||   t| d   qS )r    )meannpsqrtvar).0xr   r   r   
<listcomp>b   s   * zDWav2Vec2FeatureExtractor.zero_mean_unit_var_norm.<locals>.<listcomp>)
r"   arrayint32zipsumr!   r#   r$   shapeappend)r   r   r   Znormed_input_valuesZvectorlengthZnormed_slicer   r   r   zero_mean_unit_var_normP   s   .z0Wav2Vec2FeatureExtractor.zero_mean_unit_var_normN
raw_speechpadding
max_length
truncationpad_to_multiple_ofr   return_tensorsr   c	              
   K   s  |dur|| j krtd|  d| j  d| j  d| d	ntd| jj d t|tjo4t	|j
d	k}
|
rEt	|j
d
krEtd|  |
pXt|ttfoXt|d tjttf}|s^|g}td|i}| j||||||d}|d }t|d tjsdd |D |d< n:t|tjst|d tjr|d jttju rdd |D |d< nt|tjr|jttju r|tj|d< |d}|durdd |D |d< | jr| j||dtjur|nd}| j|d || jd|d< |dur||}|S )a  
        Main method to featurize and prepare for the model one or several sequence(s).

        Args:
            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
                stereo, i.e. single float per timestep.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)

                <Tip>

                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as
                [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
                `attention_mask` should be passed.

                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as
                [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should
                be passed for batched inference.

                </Tip>

            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of list of python integers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
            sampling_rate (`int`, *optional*):
                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
                `sampling_rate` at the forward call to prevent silent errors.
            padding_value (`float`, *optional*, defaults to 0.0):
        Nz3The model corresponding to this feature extractor: z& was trained using a sampling rate of zI. Please make sure that the provided `raw_speech` input was sampled with z	 and not .zDIt is strongly recommended to pass the `sampling_rate` argument to `zN()`. Failing to do so can result in silent errors that might be hard to debug.r      z2Only mono-channel audio is supported for input to r   r   )r1   r2   r3   r4   r   c                 S      g | ]
}t j|t jd qS )dtype)r"   asarrayfloat32r%   r(   r   r   r   r'          z5Wav2Vec2FeatureExtractor.__call__.<locals>.<listcomp>c                 S   s   g | ]}| tjqS r   )astyper"   r<   r=   r   r   r   r'      s    r   c                 S   r8   r9   )r"   r;   r)   r=   r   r   r   r'      r>   )r2   )r   r   )r   
ValueErrorloggerwarningr   __name__
isinstancer"   ndarraylenr,   listtupler   padr:   Zfloat64r?   r<   getr   Z_get_padding_strategiesr   Z
DO_NOT_PADr/   r   Zconvert_to_tensors)r   r0   r1   r2   r3   r4   r   r5   r   r   Zis_batched_numpyZ
is_batchedZencoded_inputsZpadded_inputsr   r   r   r   r   __call__f   sr   F
"




z!Wav2Vec2FeatureExtractor.__call__)r   r   r   FT)r   )FNFNNNN)rC   
__module____qualname____doc__Zmodel_input_namesr   staticmethodr   r"   rE   floatr/   r   boolstrr   r   intr	   r   rK   __classcell__r   r   r   r   r      s\    !"	r   )rN   typingr   r   r   numpyr"   Z!feature_extraction_sequence_utilsr   Zfeature_extraction_utilsr   utilsr   r	   r
   Z
get_loggerrC   rA   r   __all__r   r   r   r   <module>   s   
 
U