import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np

from ...utils import is_soundfile_available, is_torch_available


if is_torch_available():
    import torch

if is_soundfile_available():
    import soundfile as sf

from ...audio_utils import AudioInput, make_list_of_audio
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class CsmAudioKwargs(AudioKwargs, total=False):
    encoded_length_kwargs: Optional[Dict[str, Any]]


class CsmProcessorKwargs(ProcessingKwargs, total=False):
    audio_kwargs: CsmAudioKwargs
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "padding_side": "left",
            "add_special_tokens": False,
        },
        "audio_kwargs": {
            # Geometry of the codec encoder's convolutions (Mimi-style: four
            # downsampling stages of stride 4, 5, 6 and 8, each preceded by a
            # kernel-3/kernel-1 residual unit, plus a final stride-2 downsample),
            # used by `_get_encoded_length` to predict the number of codec frames.
            "encoded_length_kwargs": {
                "kernel_sizes": [7, 3, 1, 8, 3, 1, 10, 3, 1, 12, 3, 1, 16, 3, 4],
                "strides": [1, 1, 1, 4, 1, 1, 5, 1, 1, 6, 1, 1, 8, 1, 2],
                "dilations": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                "use_causal_conv": True,
            },
            "sampling_rate": 24000,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class CsmProcessor(ProcessorMixin):
    r"""
    Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
    tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import CsmProcessor
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
        audio = ds[0]["audio"]["array"]

        processor = CsmProcessor.from_pretrained("eustlb/csm-1b")

        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
            audio=audio,
            text_kwargs = {"padding": False},
            audio_kwargs = {"sampling_rate": 16000},
            common_kwargs = {"return_tensors": "pt"},
        )
        # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
        ```
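
        A call matching the defaults (a sketch: CSM expects 24 kHz audio, and the
        text must contain one `<|AUDIO|>` token per provided audio):
        ```python
        import numpy as np

        audio_24k = np.zeros(24000, dtype=np.float32)  # one second of 24 kHz silence
        processor(
            text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|>"],
            audio=audio_24k,
        )
        ```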

    Args:
        feature_extractor ([`EncodecFeatureExtractor`]):
            The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["feature_extractor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    feature_extractor_class = "EncodecFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(
        self,
        feature_extractor,
        tokenizer,
        chat_template=None,
    ):
        # Use the tokenizer's audio tokens if it defines them, otherwise fall back
        # to the defaults and resolve their ids from the vocabulary.
        if not hasattr(tokenizer, "audio_token"):
            self.audio_token = "<|AUDIO|>"
            self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        else:
            self.audio_token = tokenizer.audio_token
            self.audio_token_id = tokenizer.audio_token_id

        if not hasattr(tokenizer, "audio_eos_token"):
            self.audio_eos_token = "<|audio_eos|>"
            self.audio_eos_token_id = tokenizer.convert_tokens_to_ids(self.audio_eos_token)
        else:
            self.audio_eos_token = tokenizer.audio_eos_token
            self.audio_eos_token_id = tokenizer.audio_eos_token_id

        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

    @staticmethod
    def _get_encoded_length(audio_length, kernel_sizes=None, strides=None, dilations=None, use_causal_conv=None):
        r"""
        Compute the length of the encoded audio sequence.

        Args:
            audio_length (int): The length of the audio sequence.
            kernel_sizes (List[int]): The kernel sizes for the convolutional layers.
            strides (List[int]): The strides for the convolutional layers.
            dilations (List[int]): The dilations for the convolutional layers.
            use_causal_conv (bool): Whether to use causal convolutions.
        """
        cur_length = audio_length

        if kernel_sizes is None or strides is None or dilations is None or use_causal_conv is None:
            return cur_length

        for kernel_size, stride, dilation in zip(kernel_sizes, strides, dilations):
            # Reproduce the codec's padding arithmetic so the predicted frame count
            # matches what the encoder actually produces.
            effective_kernel_size = (kernel_size - 1) * dilation + 1
            padding_total = kernel_size - stride
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right

            n_frames = (cur_length - effective_kernel_size + padding_total) / stride + 1
            n_frames = math.ceil(n_frames) - 1
            ideal_length = n_frames * stride + kernel_size - padding_total
            extra_padding = ideal_length - cur_length

            if use_causal_conv:
                padding_left = padding_total
                padding_right = extra_padding
            else:
                padding_right = padding_right + extra_padding

            cur_length = cur_length + padding_left + padding_right
            cur_length = (cur_length - dilation * (kernel_size - 1) - 1) // stride + 1

        return cur_length

    def save_audio(
        self,
        audio: AudioInput,
        saving_path: Union[str, Path, List[Union[str, Path]]],
        **kwargs: Unpack[CsmProcessorKwargs],
    ):
        if not is_soundfile_available():
            raise ImportError("Please install `soundfile` to save audio files.")

        # Normalize inputs: a single waveform / path becomes a one-element list.
        audio = make_list_of_audio(audio)
        if isinstance(saving_path, (str, Path)):
            saving_path = [saving_path]
        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
            raise ValueError("Invalid input path. Please provide a string, or a list of strings")

        if len(audio) != len(saving_path):
            raise ValueError("The number of audio and saving paths must be the same")

        output_kwargs = self._merge_kwargs(CsmProcessorKwargs, **kwargs)
        audio_kwargs = output_kwargs["audio_kwargs"]
        sampling_rate = audio_kwargs["sampling_rate"]

        for audio_value, p in zip(audio, saving_path):
            if isinstance(audio_value, torch.Tensor):
                audio_value = audio_value.cpu().float().numpy()
            sf.write(p, audio_value, sampling_rate)

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]],
        audio: Optional[AudioInput] = None,
        output_labels: Optional[bool] = False,
        depth_decoder_labels_ratio: Optional[float] = 1.0,
        **kwargs: Unpack[CsmProcessorKwargs],
    ):
        r"""
        Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
        the text. To prepare the audio, this method forwards the `audio` arguments to
        EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
        to the docstring of the above two methods for more information.

        Args:
            audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
                tensor.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            output_labels (bool, *optional*, default=False):
                Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
                - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
                - `-100` will be ignored in the loss computation
                - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
            depth_decoder_labels_ratio (float, *optional*, default=1.0):
                The ratio of audio frames to keep for the depth decoder labels.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        """
        output_kwargs = self._merge_kwargs(
            CsmProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        n_audio_in_text = [t.count(self.audio_token) for t in text]

        n_audio = 0
        if audio is not None:
            audio = make_list_of_audio(audio)
            n_audio = len(audio)

        if sum(n_audio_in_text) > 0 and audio is None:
            raise ValueError("No audio were provided, but there are audio tokens in the prompt")
        if n_audio != sum(n_audio_in_text):
            raise ValueError(
                f"The number of audio tokens in each text ({n_audio_in_text}) should be the same as the "
                f"number of provided audios ({n_audio})."
            )

        if audio is not None:
            # Expand each audio token to as many tokens as the codec will produce
            # frames for the corresponding audio.
            encoded_length_kwargs = audio_kwargs.pop("encoded_length_kwargs", {})
            num_audio_tokens_list = [
                self._get_encoded_length(audio_array.shape[-1], **encoded_length_kwargs) for audio_array in audio
            ]
            num_audio_tokens_list_copy = num_audio_tokens_list.copy()

            expanded_text = []
            for sample in text:
                replace_str = []
                while self.audio_token in sample:
                    num_audio_tokens = num_audio_tokens_list_copy.pop(0)
                    expanded_audio_token = self.audio_token * num_audio_tokens
                    replace_str.append(expanded_audio_token)
                    sample = sample.replace(self.audio_token, "<placeholder>", 1)

                while "<placeholder>" in sample:
                    sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
                expanded_text.append(sample)
            text = expanded_text

        encoding = self.tokenizer(text, **text_kwargs)
        data = {}
        data.update(encoding)

        if audio is not None:
            # Concatenate the audios of each batch element and keep the cut indices
            # between them, padded with -1 so they can be stacked into one tensor.
            concatenated_audio, input_values_cutoffs = [], []
            offset = 0
            for n_audio_in_sample in n_audio_in_text:
                if n_audio_in_sample == 0:
                    concatenated_audio.append(np.zeros(0))
                    input_values_cutoffs.append(torch.tensor([-1]))
                else:
                    batch_audio = audio[offset : offset + n_audio_in_sample]
                    concatenated_audio.append(
                        np.concatenate(
                            [el.cpu().numpy() if isinstance(el, torch.Tensor) else el for el in batch_audio],
                            axis=-1,
                        )
                    )
                    input_values_cutoffs.append(torch.tensor([el.shape[-1] for el in batch_audio]).cumsum(dim=-1))
                    offset += n_audio_in_sample

            audio_inputs = self.feature_extractor(concatenated_audio, **audio_kwargs)
            audio_inputs.pop("padding_mask", None)
            data.update(audio_inputs)

            max_len = max(cut_idxs.shape[-1] for cut_idxs in input_values_cutoffs)
            input_values_cutoffs = [
                torch.nn.functional.pad(cut_idxs, (0, max_len - cut_idxs.shape[-1]), value=-1)
                for cut_idxs in input_values_cutoffs
            ]
            data["input_values_cutoffs"] = torch.stack(input_values_cutoffs, dim=0)

        if output_labels:
            audio_frame_idxs = (data["input_ids"] == self.audio_token_id).nonzero()
            n_audio_frames = audio_frame_idxs.shape[0]

            if depth_decoder_labels_ratio < 1.0:
                # Sample the audio frames whose depth decoder labels will be skipped.
                rand_idxs = torch.randperm(n_audio_frames)[: int(n_audio_frames * (1 - depth_decoder_labels_ratio))]
                skip_frames_idxs = audio_frame_idxs[rand_idxs]
            else:
                skip_frames_idxs = audio_frame_idxs

            labels = torch.where(data["input_ids"] == self.audio_token_id, data["input_ids"], -100)
            labels[skip_frames_idxs[:, 0], skip_frames_idxs[:, 1]] = -101
            data["labels"] = labels

        return BatchFeature(data=data, tensor_type=return_tensors)


__all__ = ["CsmProcessor"]
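

# A minimal smoke check (a sketch, not part of the public API): with the default
# geometry above, the total downsampling is 4 * 5 * 6 * 8 * 2 = 1920, so one second
# of 24 kHz audio should encode to about 24000 / 1920 = 12.5, i.e. ~13 causal frames.
# Run as `python -m transformers.models.csm.processing_csm` so relative imports resolve.
if __name__ == "__main__":
    default_geometry = CsmProcessorKwargs._defaults["audio_kwargs"]["encoded_length_kwargs"]
    n_frames = CsmProcessor._get_encoded_length(24000, **default_geometry)
    print(f"1 s of 24 kHz audio -> {n_frames} codec frames")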