"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Dict, List, Optional, Tuple, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Find any character outside the ASCII range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None
    return has_non_roman
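
# e.g. has_non_roman_characters("hello") -> False, has_non_roman_characters("héllo") -> True,
# since "é" falls outside the ASCII range matched by the pattern above.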


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
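
    Example (a minimal usage sketch; the checkpoint name is illustrative and assumes a
    downloadable VITS/MMS-TTS repository):

    ```python
    >>> from transformers import VitsTokenizer

    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    >>> inputs = tokenizer("hello world", return_tensors="pt")
    >>> # with `add_blank=True`, token id 0 is interleaved between the character ids
    >>> inputs["input_ids"]
    ```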
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`Dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
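
        Example (an illustrative sketch, assuming a character-level checkpoint with
        `phonemize=False` so the `phonemizer` package is not required):

        ```python
        >>> from transformers import VitsTokenizer

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> text, remaining_kwargs = tokenizer.prepare_for_tokenization("Hello, World!", normalize=True)
        >>> # `text` is lower-cased; characters missing from the vocabulary are stripped
        ```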
        """
        normalize = normalize if normalize is not None else self.normalize

        if normalize:
            # normalise for casing
            text = self.normalize_text(text)

        filtered_text = self._preprocess_char(text)

        if has_non_roman_characters(filtered_text) and self.is_uroman:
            if not is_uroman_available():
                logger.warning(
                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman`. "
                    "Note `uroman` requires python version >= 3.10. "
                    "Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
                )
            else:
                uroman = ur.Uroman()
                filtered_text = uroman.romanize_string(filtered_text)

        if self.phonemize:
            if not is_phonemizer_available():
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # strip any chars outside of the vocab (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        return filtered_text, kwargs

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
        tokens = list(text)

        if self.add_blank:
            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2 + 1)
            interspersed[1::2] = tokens
            tokens = interspersed

        return tokens

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        if self.add_blank and len(tokens) > 1:
            tokens = tokens[1::2]
        return "".join(tokens)
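
    # Round-trip illustration: with `add_blank=True` and id 0 decoding to "<pad>",
    # `_tokenize("ab")` produces ["<pad>", "a", "<pad>", "b", "<pad>"], and
    # `convert_tokens_to_string` recovers "ab" by keeping `tokens[1::2]`.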

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Union[Tuple[str], None]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return

        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False))

        return (vocab_file,)


__all__ = ["VitsTokenizer"]