
import math
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import numpy as np

from ...utils import is_soundfile_available, is_torch_available


if is_torch_available():
    import torch

if is_soundfile_available():
    import soundfile as sf

from ...audio_utils import AudioInput, make_list_of_audio
from ...feature_extraction_utils import BatchFeature
from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class CsmAudioKwargs(AudioKwargs, total=False):
    encoded_length_kwargs: Optional[Dict[str, Any]]
class CsmProcessorKwargs(ProcessingKwargs, total=False):
    audio_kwargs: CsmAudioKwargs

    _defaults = {
        "text_kwargs": {
            "padding": True,
            "padding_side": "left",
            "add_special_tokens": False,
        },
        "audio_kwargs": {
            # Geometry of the codec's convolutional encoder, used by
            # `_get_encoded_length` to infer how many audio tokens a waveform
            # maps to. The integer lists below are reconstructed to match the
            # Mimi encoder layout (total downsampling 1920, i.e. 12.5 Hz at
            # 24 kHz); verify them against the released processor defaults.
            "encoded_length_kwargs": {
                "kernel_sizes": [7, 3, 1, 8, 3, 1, 10, 3, 1, 12, 3, 1, 16, 3, 4],
                "strides": [1, 1, 1, 4, 1, 1, 5, 1, 1, 6, 1, 1, 8, 1, 2],
                "dilations": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                "use_causal_conv": True,
            },
            "sampling_rate": 24000,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class CsmProcessor(ProcessorMixin):
    r"""
Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and
[`PreTrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and
tokenizer functionalities. See [`~CsmProcessor.__call__`] for more information.

The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import CsmProcessor
    from datasets import load_dataset

    ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
    audio = ds[0]["audio"]["array"]

    processor = CsmProcessor.from_pretrained("eustlb/csm-1b")

    processor(
        text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"],
        audio=audio,
        text_kwargs={"padding": False},
        audio_kwargs={"sampling_rate": 16000},
        common_kwargs={"return_tensors": "pt"},
    )
    # this should error out because EncodecFeatureExtractor expects a 24kHz audio :)
    ```
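
    For a working call, the audio would first be brought to the 24kHz rate the
    feature extractor expects. A minimal sketch (the `Audio` cast assumes the
    `datasets` library is used for loading, as above):

    ```python
    from datasets import Audio

    ds = ds.cast_column("audio", Audio(sampling_rate=24000))
    audio = ds[0]["audio"]["array"]

    processor(
        text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|>"],
        audio=audio,
    )
    ```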

Args:
    feature_extractor ([`EncodecFeatureExtractor`]):
        The feature extractor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
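
    When a chat template is available, conversations can also be prepared with
    [`~ProcessorMixin.apply_chat_template`]. A hedged sketch (the message schema is
    assumed from the usual speaker-id convention, not recovered from this file):

    ```python
    conversation = [
        {
            "role": "0",
            "content": [
                {"type": "text", "text": "What are you working on?"},
                {"type": "audio", "path": "speaker0.wav"},
            ],
        },
    ]
    inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True)
    ```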

    """

    attributes = ["feature_extractor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    feature_extractor_class = "EncodecFeatureExtractor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(
        self,
        feature_extractor,
        tokenizer,
        chat_template=None,
    ):
        # fall back to the default audio tokens when the tokenizer does not define them
        if not hasattr(tokenizer, "audio_token"):
            self.audio_token = "<|AUDIO|>"
            self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        else:
            self.audio_token = tokenizer.audio_token
            self.audio_token_id = tokenizer.audio_token_id

        if not hasattr(tokenizer, "audio_eos_token"):
            self.audio_eos_token = "<|audio_eos|>"
            self.audio_eos_token_id = tokenizer.convert_tokens_to_ids(self.audio_eos_token)
        else:
            self.audio_eos_token = tokenizer.audio_eos_token
            self.audio_eos_token_id = tokenizer.audio_eos_token_id

        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)

    @staticmethod
    def _get_encoded_length(audio_length, kernel_sizes=None, strides=None, dilations=None, use_causal_conv=None):
        r"""
Compute the length of the encoded audio sequence.

Args:
    audio_length (int): The length of the audio sequence.
    kernel_sizes (List[int]): The kernel sizes for the convolutional layers.
    strides (List[int]): The strides for the convolutional layers.
    dilations (List[int]): The dilations for the convolutional layers.
    use_causal_conv (bool): Whether to use causal convolutions.
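
    Example (illustrative arithmetic, not part of the original docstring): a single
    causal layer with `kernel_size=7`, `stride=2`, `dilation=1` left-pads a length-100
    input by `padding_total = 7 - 2 = 5` samples (no extra right padding is needed
    here), so the layer outputs `(105 - 1 * (7 - 1) - 1) // 2 + 1 = 50` frames,
    i.e. roughly `length / stride`.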
        """
        cur_length = audio_length

        if kernel_sizes is None or strides is None or dilations is None or use_causal_conv is None:
            return cur_length

        # mirror the padding and striding arithmetic of each encoder convolution
        for kernel_size, stride, dilation in zip(kernel_sizes, strides, dilations):
            effective_kernel_size = (kernel_size - 1) * dilation + 1
            padding_total = kernel_size - stride
            padding_right = padding_total // 2
            padding_left = padding_total - padding_right

            n_frames = (cur_length - effective_kernel_size + padding_total) / stride + 1
            n_frames = math.ceil(n_frames) - 1
            ideal_length = n_frames * stride + kernel_size - padding_total
            extra_padding = ideal_length - cur_length

            if use_causal_conv:
                padding_left = padding_total
                padding_right = extra_padding
            else:
                padding_right = padding_right + extra_padding

            cur_length = cur_length + padding_left + padding_right
            cur_length = (cur_length - dilation * (kernel_size - 1) - 1) // stride + 1

        return cur_length

    def save_audio(
        self,
        audio: AudioInput,
        saving_path: Union[str, Path, List[Union[str, Path]]],
        **kwargs: Unpack[CsmProcessorKwargs],
    ):
        if not is_soundfile_available():
            raise ImportError("Please install `soundfile` to save audio files.")

        # ensure correct audio input
        audio = make_list_of_audio(audio)

        # ensure correct saving path
        if isinstance(saving_path, (str, Path)):
            saving_path = [saving_path]
        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
            raise ValueError("Invalid input path. Please provide a string, or a list of strings")

        if len(audio) != len(saving_path):
            raise ValueError("The number of audio and saving paths must be the same")

        output_kwargs = self._merge_kwargs(CsmProcessorKwargs, **kwargs)
        audio_kwargs = output_kwargs["audio_kwargs"]
        sampling_rate = audio_kwargs["sampling_rate"]

        for audio_value, p in zip(audio, saving_path):
            if isinstance(audio_value, torch.Tensor):
                audio_value = audio_value.cpu().float().numpy()
            sf.write(p, audio_value, sampling_rate)

    def __call__(
        self,
        text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]],
        audio: Optional[AudioInput] = None,
        output_labels: Optional[bool] = False,
        depth_decoder_labels_ratio: Optional[float] = 1.0,
        **kwargs: Unpack[CsmProcessorKwargs],
    ):
        r"""
Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text`
arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode
the text. To prepare the audio, this method forwards the `audio` arguments to
EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer
to the docstring of the above two methods for more information.
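
Example — a minimal sketch (the waveform below is synthetic and merely stands in for
real 24kHz speech; one `<|AUDIO|>` placeholder in the text matches the one audio):

    ```python
    import numpy as np
    from transformers import CsmProcessor

    processor = CsmProcessor.from_pretrained("eustlb/csm-1b")

    # a hypothetical one-second clip at the expected 24kHz sampling rate
    audio = np.random.randn(24000).astype(np.float32)
    inputs = processor(
        text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|>"],
        audio=audio,
    )
    ```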

Args:
    audio (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
        The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch
        tensor.
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    output_labels (bool, *optional*, defaults to `False`):
        Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`.
        - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
        - `-100` will be ignored in the loss computation
        - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)
    depth_decoder_labels_ratio (float, *optional*, defaults to `1.0`):
        The ratio of audio frames to keep for the depth decoder labels.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
            - `'tf'`: Return TensorFlow `tf.constant` objects.
            - `'pt'`: Return PyTorch `torch.Tensor` objects.
            - `'np'`: Return NumPy `np.ndarray` objects.
            - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **input_values** -- List of audio values to be fed to a model. Returned when `audio` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **labels** -- List of labels for the audio frames. Returned when `output_labels=True`.
        """
        output_kwargs = self._merge_kwargs(
            CsmProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        text_kwargs = output_kwargs["text_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", None)
        if return_tensors != "pt":
            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")

        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        n_audio_in_text = [t.count(self.audio_token) for t in text]

        n_audio = 0
        if audio is not None:
            audio = make_list_of_audio(audio)
            n_audio = len(audio)

        if sum(n_audio_in_text) > 0 and n_audio != sum(n_audio_in_text):
            if audio is None:
                raise ValueError("No audio were provided, but there are audio tokens in the prompt")
            raise ValueError(
                f"The number of audio tokens in each text ({n_audio_in_text}) should be the same as the "
                f"number of provided audios ({n_audio})."
            )

        if audio is not None:
            # expand each <|AUDIO|> token to as many tokens as the encoded audio has frames
            encoded_length_kwargs = audio_kwargs.pop("encoded_length_kwargs", {})
            num_audio_tokens_list = [
                self._get_encoded_length(audio_array.shape[-1], **encoded_length_kwargs) for audio_array in audio
            ]
            num_audio_tokens_list_copy = num_audio_tokens_list.copy()

            expanded_text = []
            for sample in text:
                replace_str = []
                while self.audio_token in sample:
                    num_audio_tokens = num_audio_tokens_list_copy.pop(0)
                    expanded_audio_token = self.audio_token * num_audio_tokens
                    replace_str.append(expanded_audio_token)
                    sample = sample.replace(self.audio_token, "<placeholder>", 1)

                while "<placeholder>" in sample:
                    sample = sample.replace("<placeholder>", replace_str.pop(0), 1)
                expanded_text.append(sample)
            text = expanded_text

        encoding = self.tokenizer(text, **text_kwargs)
        data = {}
        data.update(encoding)

        if audio is not None:
            audio_kwargs.pop("return_attention_mask", None)

            # concatenate the audios belonging to the same text sample and keep
            # track of where each original audio ends (its "cutoff" index)
            concatenated_audio, input_values_cutoffs = [], []
            offset = 0
            for n_audio in n_audio_in_text:
                if n_audio == 0:
                    concatenated_audio.append(np.zeros(0))
                    input_values_cutoffs.append(torch.tensor([-1]))
                else:
                    concatenated_audio.append(
                        np.concatenate(
                            [el.cpu().numpy() if isinstance(el, torch.Tensor) else el for el in audio[offset : offset + n_audio]],
                            axis=-1,
                        )
                    )
                    input_values_cutoffs.append(
                        torch.tensor([el.shape[-1] for el in audio[offset : offset + n_audio]]).cumsum(dim=-1)
                    )
                    offset += n_audio

            audio_inputs = self.feature_extractor(concatenated_audio, **audio_kwargs)
            audio_inputs.pop("padding_mask", None)
            data.update(audio_inputs)

            # pad the cutoff tensors to the same length so they can be stacked
            max_len = max(cut_idxs.shape[-1] for cut_idxs in input_values_cutoffs)
            input_values_cutoffs = [
                torch.nn.functional.pad(cut_idxs, (0, max_len - cut_idxs.shape[-1]), value=-1)
                for cut_idxs in input_values_cutoffs
            ]
            data["input_values_cutoffs"] = torch.stack(input_values_cutoffs, dim=0)

        if output_labels:
            audio_frame_idxs = (data["input_ids"] == self.audio_token_id).nonzero()
            n_audio_frames = audio_frame_idxs.shape[0]

            # randomly select the audio frames the depth decoder should ignore
            if depth_decoder_labels_ratio <= 1.0:
                rand_idxs = torch.randperm(n_audio_frames)[: int(n_audio_frames * (1 - depth_decoder_labels_ratio))]
                skip_frames_idxs = audio_frame_idxs[rand_idxs]
            else:
                skip_frames_idxs = audio_frame_idxs

            labels = torch.where(data["input_ids"] == self.audio_token_id, data["input_ids"], -100)
            labels[skip_frames_idxs[:, 0], skip_frames_idxs[:, 1]] = -101

            data["labels"] = labels

        return BatchFeature(data=data, tensor_type=return_tensors)


__all__ = ["CsmProcessor"]