"""Tokenization class for SpeechT5."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires
from .number_normalizer import EnglishNumberNormalizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}


@requires(backends=("sentencepiece",))
class SpeechT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally with a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelt-out English counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: Samples from the `nbest_size` best results.
              - `nbest_size < 0`: Assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
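
    Example (a minimal usage sketch; `microsoft/speecht5_tts` is assumed here as a checkpoint that ships the
    `spm_char.model` vocabulary, and no token ID outputs are shown since they depend on the loaded vocabulary):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    >>> inputs = tokenizer("The quick brown fox jumped over the lazy dog.")

    >>> # With `normalize=True`, numeric quantities are spelt out before tokenization.
    >>> normalizing_tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts", normalize=True)
    >>> inputs = normalizing_tokenizer("There are 3 apples.")
    ```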
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.normalize = normalize
        self._normalizer = None

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    @property
    def normalizer(self):
        # Lazily instantiate the number normalizer so it is only built when actually needed.
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        # The SentencePiece processor is not picklable; drop it here and rebuild it in `__setstate__`.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Make sure that special tokens are not decoded using the SentencePiece model.
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        # Only the trailing eos token is special; everything before it is a regular token.
        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Copy the original SentencePiece model if it exists; otherwise serialize the in-memory model.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["SpeechT5Tokenizer"]