
    fThw5                         S r SSKrSSKrSSKJr  SSKrSSKJr  SSK	J
r
  SSKJr  SSKJr  S	S
KJr  \R"                  " \5      r " S S\
5      rS/rg)z
Processor class for Bark
    N)Optional   )BatchFeature)ProcessorMixin)logging)cached_file   )AutoTokenizerc                      ^  \ rS rSrSrSrS/rSSSS.rSU 4S jjr\	 SS	 j5       r
   SS
\4U 4S jjjrSS\\   4S jjrSS\\   4S jjr       SS jrSrU =r$ )BarkProcessor#   a  
Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor.

Args:
    tokenizer ([`PreTrainedTokenizer`]):
        An instance of [`PreTrainedTokenizer`].
    speaker_embeddings (`Dict[Dict[str]]`, *optional*):
        Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g
        `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`
        embeddings. The values correspond to the path of the corresponding `np.ndarray`. See
        [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for
        a list of `voice_preset_names`.

r
   	tokenizer   r	   semantic_promptcoarse_promptfine_promptc                 0   > [         TU ]  U5        X l        g N)super__init__speaker_embeddings)selfr   r   	__class__s      `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/bark/processing_bark.pyr   BarkProcessor.__init__<   s    #"4    c                 b   Ub  [        UUUR                  SS5      UR                  SS5      UR                  SS5      UR                  SS5      UR                  SS5      UR                  SS5      UR                  S	S5      UR                  S
S5      SSSS9nUc9  [        R                  S[        R
                  R                  X5       S35        SnO-[        U5       n[        R                  " U5      nSSS5        OSn[        R                  " U40 UD6nU " UWS9$ ! , (       d  f       N,= f)ac  
Instantiate a Bark processor associated with a pretrained model.

Args:
    pretrained_model_name_or_path (`str` or `os.PathLike`):
        This can be either:

        - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on
          huggingface.co.
        - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`]
          method, e.g., `./my_model_directory/`.
    speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
        The name of the `.json` file containing the speaker_embeddings dictionary located in
        `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded.
    **kwargs
        Additional keyword arguments passed along to both
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
N	subfolder	cache_dirforce_downloadFproxiesresume_downloadlocal_files_onlyuse_auth_tokenrevisionr   r    r!   r"   r#   r$   tokenr&    _raise_exceptions_for_gated_repo%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors`z` does not exists
                    , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json
                    dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.)r   r   )r   poploggerwarningospathjoinopenjsonloadr
   from_pretrained)cls!pretrained_processor_name_or_pathspeaker_embeddings_dict_pathkwargsspeaker_embeddings_pathr   speaker_embeddings_jsonr   s           r   r6   BarkProcessor.from_pretrainedA   s1   . (3&11, **[$7 **[$7%zz*:EB

9d3 &

+<d C!',>!Fjj!148J5166;8='# '."'',,'Hgh i] `
 &*"126M)-3J)K& 32 "&!112S^W]^	Y;MNN 32s    D  
D.push_to_hubc                   > U R                   Gb.  [        R                  " [        R                  R	                  XS5      SS9  0 nXS'   U R                    H  nUS:w  d  M  U R                  U5      n0 n	U R                   U    Hh  n
[        R                  " [        R                  R	                  US   X7 SU
 35      X   SS9  [        R                  R	                  X7 SU
 S	35      X'   Mj     XU'   M     [        [        R                  R	                  X5      S
5       n[        R                  " Xk5        SSS5        [        TU ]0  " X40 UD6  g! , (       d  f       N = f)a  
Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded
using the [`~BarkProcessor.from_pretrained`] method.

Args:
    save_directory (`str` or `os.PathLike`):
        Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created
        if it does not exist).
    speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`):
        The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it
        exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`.
    speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`):
        The name of the folder in which the speaker_embeddings arrays will be saved.
    push_to_hub (`bool`, *optional*, defaults to `False`):
        Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
        repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
        namespace).
    kwargs:
        Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
Nv2T)exist_okrepo_or_path_F)allow_picklez.npyw)r   r0   makedirsr1   r2   _load_voice_presetnpsaver3   r4   dumpr   save_pretrained)r   save_directoryr9   speaker_embeddings_directoryr>   r:   embeddings_dict
prompt_keyvoice_presettmp_dictkeyfpr   s               r   rK   BarkProcessor.save_pretrainedy   sA   8 "".KK^SWXcgh O.<N+"55
/#'#:#::#FL!H#66zBGGLL / ?A]ammnorns_t )-). )+5QUaabcfbggkSl(m  C 3;J/ 6" bggll>PRUVZ\		/. W 	FvF WVs   E
ErP   c                    U R                   U   n0 nS GH3  nXS;  a  [        SU SU S35      e[        U R                   R                  SS5      X5   UR	                  SS 5      UR	                  SS 5      UR	                  S	S
5      UR	                  SS 5      UR	                  SS 5      UR	                  SS
5      UR	                  SS 5      UR	                  SS 5      S
S
S
S9nUcL  [        S[
        R                  R                  U R                   R                  SS5      X5   5       SU S35      e[        R                  " U5      XE'   GM6     U$ )Nr   #Voice preset unrecognized, missing z% as a key in self.speaker_embeddings[z].rB   /r   r    r!   Fr"   r#   r$   r%   r&   r'   r,   z{` does not exists
                    , no preloaded voice preset will be used - Make sure to provide correct paths to the z 
                    embeddings.)
r   
ValueErrorr   getr-   r0   r1   r2   rH   r5   )r   rP   r:   voice_preset_pathsvoice_preset_dictrR   r1   s          r   rG    BarkProcessor._load_voice_preset   sh   !44\BFC, 9#>cdpcqqst  ''++NC@"' **[$7 **[$7%zz*:EB

9d3 &

+<d C!',>!Fjj!148J5166;8=D | "'',,t'>'>'B'B>SV'WYkYpqr sjjviw x #  &(WWT]"9 G< ! r   c           	      l   S H  nX!;  a  [        SU S35      e[        X   [        R                  5      (       d'  [	        U S[        U R                  U   5       S35      e[        X   R                  5      U R                  U   :w  d  M  [        U S[        U R                  U   5       S35      e   g )Nr   rV   z
 as a key.z voice preset must be a z
D ndarray.)	rX   
isinstancerH   ndarray	TypeErrorstrpreset_shapelenshape)r   rP   rR   s      r   _validate_voice_preset_dict)BarkProcessor._validate_voice_preset_dict   s    FC& #Fse:!VWWl/<<3%'?DDUDUVYDZ@[?\\f ghh<$**+t/@/@/EE C5(@TEVEVWZE[A\@]]g!hii Gr   c           
         Ub  [        U[        5      (       d  [        U[        5      (       a.  U R                  b!  X R                  ;   a  U R	                  U5      nOF[        U[        5      (       a  UR                  S5      (       d  US-   n[        R                  " U5      nUb  U R                  " U40 UD6  [        X#S9nU R                  " U4USUUUUS.UD6n	Ub  X)S'   U	$ )a.  
Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs`
arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a
voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded
to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename.

Args:
    text (`str`, `List[str]`, `List[List[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    voice_preset (`str`, `Dict[np.ndarray]`):
        The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g
        `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or
        it can be a valid file name of a local `.npz` single voice preset.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.

Returns:
    Tuple([`BatchEncoding`], [`BatchFeature`]): A tuple composed of a [`BatchEncoding`], i.e the output of the
    `tokenizer` and a [`BatchFeature`], i.e the voice preset with the right tensors type.
z.npz)datatensor_type
max_length)return_tensorspaddingrj   return_attention_maskreturn_token_type_idsadd_special_tokenshistory_prompt)r^   dictra   r   rG   endswithrH   r5   re   r   r   )
r   textrP   rk   rj   ro   rm   rn   r:   encoded_texts
             r   __call__BarkProcessor.__call__   s    H #J|T,J,J<--++7 $;$;;#66|D lC009N9Nv9V9V#/&#8L!ww|4#,,\DVD'\VL~~	
) !"7"71	
 	
 #-9)*r   )r   r   )speaker_embeddings_path.json)rw   r   F)NNpt   FTF)__name__
__module____qualname____firstlineno____doc__tokenizer_class
attributesrb   r   classmethodr6   boolrK   r   ra   rG   rq   re   ru   __static_attributes____classcell__)r   s   @r   r   r   #   s     &OJ L5
 Mk5O 5Ot &D%9!7G
 7G 7Gr"!x} "!H	j 	j  "#D Dr   r   )r~   r4   r0   typingr   numpyrH   feature_extraction_utilsr   processing_utilsr   utilsr   	utils.hubr   autor
   
get_loggerrz   r.   r   __all__ r   r   <module>r      sR     	   4 .  $   
		H	%BN BJ 
r   