"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Find any character outside the ASCII range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None

    return has_non_roman


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text
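    # Illustrative note (not part of the original module): with a typical
    # MMS-TTS vocabulary of lower-case characters, `normalize_text` simply
    # lower-cases free text, e.g.
    #
    #     tokenizer.normalize_text("Hello")  # -> "hello"
    #
    # while any exact vocabulary or added-token match (such as an upper-cased
    # special token string) is copied through unchanged rather than lowered.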
    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
        """
        normalize = normalize if normalize is not None else self.normalize

        if normalize:
            # normalise for casing
            text = self.normalize_text(text)

        filtered_text = self._preprocess_char(text)

        if has_non_roman_characters(filtered_text) and self.is_uroman:
            if not is_uroman_available():
                logger.warning(
                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman`. "
                    "Note `uroman` requires python version >= 3.10. "
                    "Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
                )
            else:
                uroman = ur.Uroman()
                filtered_text = uroman.romanize_string(filtered_text)

        if self.phonemize:
            if not is_phonemizer_available():
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # strip any chars outside of the vocab (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        return filtered_text, kwargs

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
        tokens = list(text)

        if self.add_blank:
            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2 + 1)
            interspersed[1::2] = tokens
            tokens = interspersed

        return tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        if self.add_blank and len(tokens) > 1:
            tokens = tokens[1::2]
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)
         [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[        USSS9 nUR                  [        R                  " U R                  S	S
SS9S-   5        S S S 5        U4$ ! , (       d  f       U4$ = f)NzVocabulary path (z) should be a directory-rJ   r   wr!   r"   ru   TF)indent	sort_keysensure_ascii
)ospathisdirrg   errorro   VOCAB_FILES_NAMESr,   writer-   dumpsr/   )r4   r   r   r   fs        r   save_vocabularyVitsTokenizer.save_vocabulary   s    ww}}^,,LL,^,<<STUWW\\o_s22QbcoQpp

 *cG4GGDJJt||ATYZ]aab 5 } 54 }s   ?4B>>
C)r'   r1   r/   r*   r&   r(   r)   )z<pad>z<unk>NTTTF)r   N)FNr<   )__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr3   propertyr>   rG   rS   rY   strboolr   r   r   r   rr   r   ry   r|   r   rv   r   r   __static_attributes____classcell__)r9   s   @r   r   r   /   s   * *$&67
  
 
 
  
D ! !
* Y]?%?%.2?%GOPT~?%	sDcN"	#?%B	c 	d3i 	tCy S 
I'c HSM ]bchilcmoscs]t  r   r   )r   r-   r   r   typingr   r   r   r   r   r   tokenization_utilsr
   utilsr   r   r   rm   rq   ri   
get_loggerr   rg   r   r   r   __all__r+   r   r   <module>r      sv    #  	 	 : : 5 J J 			H	%!<0 D' DN 
r   
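# A minimal end-to-end usage sketch (illustrative, not part of the original
# module; the checkpoint name is an assumption, and English MMS-TTS models
# additionally require the `phonemizer` package with an espeak backend):
#
#     from transformers import VitsTokenizer
#
#     tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
#     inputs = tokenizer("Hello world", return_tensors="pt")
#     # With add_blank=True, inputs["input_ids"] alternates 0, id, 0, id, ..., 0.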