
"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class T5Tokenizer(PreTrainedTokenizer):
    """
Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    extra_ids (`int`, *optional*, defaults to 100):
        Number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
        "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. They can be retrieved by calling
        `get_sentinel_tokens`, and their token ids by calling `get_sentinel_token_ids`.
    additional_special_tokens (`List[str]`, *optional*):
        Additional special tokens used by the tokenizer.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.
    legacy (`bool`, *optional*):
        Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is before the merge of #24622
        and #25224 which includes fixes to properly handle tokens that appear after special tokens. A simple
        example:

        - `legacy=True`:
        ```python
        >>> from transformers import T5Tokenizer

        >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
        >>> tokenizer.encode("Hello <extra_id_0>.")
        [8774, 32099, 3, 5, 1]
        ```
        - `legacy=False`:
        ```python
        >>> from transformers import T5Tokenizer

        >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
        >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
        [8774, 32099, 5, 1]
        ```
        Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
    add_prefix_space (`bool`, *optional*, defaults to `True`):
        Whether or not to add an initial space to the input. This lets the leading word be treated just like any
        other word.

Attributes:
    sp_model (`SentencePieceProcessor`):
        The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
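
Example (a minimal usage sketch; it assumes the `google-t5/t5-base` checkpoint can be downloaded, and the exact
token ids depend on that vocabulary):

```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
>>> # the 100 sentinel tokens are registered as additional special tokens
>>> "<extra_id_0>" in tokenizer.get_sentinel_tokens()
True
>>> tokenizer.convert_tokens_to_ids("<extra_id_0>") in tokenizer.get_sentinel_token_ids()
True
>>> # encoding appends the eos token ("</s>") automatically
>>> tokenizer("Hello")["input_ids"][-1] == tokenizer.eos_token_id
True
```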
	input_idsattention_maskNsp_model_kwargsreturnc
                   > [        U[        5      (       a
  [        USS9OUn[        U[        5      (       a
  [        USS9OUn[        U[        5      (       a
  [        USS9OUnUc  0 OUU l        Xl        XPl        [        R                  " S0 U R                  D6U l        U R                  R                  U5        Ub~  U Vs/ s H  nS[        U5      ;   d  M  UPM     nn[        U5      S:  a$  U[        U5       Vs/ s H	  nSU S3PM     sn-  nOIUS:  a!  U[        U5      :w  a  [        SU SU S	35      eO![        U5       Vs/ s H	  nSU S3PM     nnUn0 U l        [        [        U5      5       H>  n[        SU S3S
SSSS
S9U R                  [        U R                  5      S-
  U-   U-
  '   M@     Uc%  [        R                  SU R                    S35        SnXl        U R%                  U
R'                  SS
5      5      U l        Xl        XPl        Xl        [*        TU ]X  " SUUUUUU R                  UU	S.U
D6  g s  snf s  snf s  snf )NT)specialz
<extra_id_   >r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensF)single_wordlstriprstripr   
normalizedz2You are using the default legacy behaviour of the a_  . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565	from_slow)	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   legacyadd_prefix_space )
isinstancestrr   r   r   
_extra_idsspmSentencePieceProcessorsp_modelLoadlenrange
ValueError_added_tokens_decoderloggerwarning_once	__class__r)   get_spm_processorpopr*   super__init__)selfr   r$   r%   r&   r'   r(   r   r)   r*   kwargsxextra_tokensir9   s                 ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/t5/tokenization_t5.pyr=   T5Tokenizer.__init__   s~    <FiQT;U;UJy$7[d	;EiQT;U;UJy$7[d	;EiQT;U;UJy$7[d	%4%<r/$#22JT5I5IJ:&$0'@['@!LTWXYTZDZA'@L[< 1$)yIY-ZIYA
1#Q.?IY-ZZ)Q9L0A#A &yk1RSlRm n   8=Y7GH7G!j1-7GLH(4% &("s<()AQ[QCq!uT$X\inRD&&s4=='9A'=	'IA'MN *
 >DT^^DT UJ J F..vzz+u/MN$# 0 
	
&? 00-
	
 
	
I \-Z Is   II IIc                 T   [         R                  " S0 U R                  D6nU R                  (       d  U(       a  UR	                  U R
                  5        U$ [        U R
                  S5       nUR                  5       n[        SU R                  R                   S35      nUR                  R                  U5      nUR                  5       nSUl        UR                  R!                  U5        UR#                  5       nUR%                  U5        S S S 5        U$ ! , (       d  f       U$ = f)NrbzThe new behaviour of z (with `self.legacy = False`)Fr+   )r/   r0   r   r)   r2   r   openreadr   r9   __name__
ModelProto
FromStringNormalizerSpecadd_dummy_prefixnormalizer_spec	MergeFromSerializeToStringLoadFromSerializedProto)r>   r#   	tokenizerfr1   	model_pb2modelrN   s           rC   r:   T5Tokenizer.get_spm_processor   s    ..F1E1EF	;;)NN4??+$//4(AvvxH'*?@W@W?XXu(vwI((33H=E'668O/4O,!!++O<..0H--h7 )  )( s   ,B"D
D'c                     U [         R                  ;   aH  [         R                  U    nUb  X!:w  a  U$ Uc(  [        R                  " SU SU  SU SU S3	[        5        U$ )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   max_model_input_sizeswarningswarnFutureWarning)pretrained_model_name_or_pathmax_model_lengthinit_max_model_lengthdeprecated_max_model_lengths       rC   !_eventually_correct_t5_max_length-T5Tokenizer._eventually_correct_t5_max_length   s    (K,M,MM*5*K*KLi*j'$05J5^,,&.34 5 66 734 5$$?#@ Agg "      c                 6    U R                   R                  5       $ N)r1   get_piece_sizer>   s    rC   
vocab_sizeT5Tokenizer.vocab_size   s    }}++--rb   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf rd   )r4   rg   convert_ids_to_tokensupdateadded_tokens_encoder)r>   rB   vocabs      rC   	get_vocabT5Tokenizer.get_vocab   sL    ;@;QR;Qa++A.1;QRT../ Ss   Atoken_ids_0token_ids_1already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ Uc  S/[        U5      -  S/-   $ S/[        U5      -  S/-   S/[        U5      -  -   S/-   $ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
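
For illustration (the input ids are arbitrary; only the position of the appended special token matters):

```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
>>> tokenizer.get_special_tokens_mask([8774, 10])  # two regular tokens -> the eos that would be appended is marked
[0, 0, 1]
```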
T)rp   rq   rr   r   r   )r<   get_special_tokens_maskr3   )r>   rp   rq   rr   r9   s       rC   rt   #T5Tokenizer.get_special_tokens_mask   sw    $ &72']a 3  
 C#k**qc11c+&&1#-!s;7G1GHA3NNrb   c                 T    [        [        [        S U R                  5      5      5      $ )Nc                 F    [        [        R                  " SU 5      5      S L$ )Nz<extra_id_\d+>)boolresearch)r@   s    rC   <lambda>1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>  s    bii0A1&E!Fd!Rrb   )listsetfilterr(   rf   s    rC   get_sentinel_tokensT5Tokenizer.get_sentinel_tokens  s&    RTXTrTrst
 	
rb   c                 j    U R                  5        Vs/ s H  oR                  U5      PM     sn$ s  snf rd   )r   convert_tokens_to_idsr>   tokens     rC   get_sentinel_token_ids"T5Tokenizer.get_sentinel_token_ids  s.    ?C?W?W?YZ?Ye**51?YZZZs   0	token_idsc                     [        U5      S:  a9  US   U R                  :X  a&  [        R                  " SU R                   S35        U$ XR                  /-   $ )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r3   eos_token_idrY   rZ   r$   )r>   r   s     rC   _add_eos_if_not_present#T5Tokenizer._add_eos_if_not_present  s[    y>A)B-43D3D"DMM,T^^,< =+ +  1 1222rb   c                 r    U R                   /nUc  [        X-   5      S/-  $ [        X-   U-   U-   5      S/-  $ )aw  
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of zeros.
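
For instance (illustrative ids; only the length of the result matters):

```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
>>> tokenizer.create_token_type_ids_from_sequences([8774, 10])  # two tokens plus eos -> three zeros
[0, 0, 0]
```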
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:

- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
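
For example (a sketch; the ids themselves depend on the vocabulary, so the result is checked against the eos id):

```python
>>> from transformers import T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
>>> ids_a = tokenizer.encode("Translate English to German: Hello", add_special_tokens=False)
>>> ids_b = tokenizer.encode("Hallo", add_special_tokens=False)
>>> eos = tokenizer.eos_token_id
>>> tokenizer.build_inputs_with_special_tokens(ids_a, ids_b) == ids_a + [eos] + ids_b + [eos]
True
```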
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        token_ids_1 = self._add_eos_if_not_present(token_ids_1)
        return token_ids_0 + token_ids_1

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
first token is special.
r    r   N)r)   r3   r<   tokenizereplaceSPIECE_UNDERLINEr*   all_special_tokens)r>   r   r?   tokensr9   s       rC   r   T5Tokenizer.tokenizeh  s    
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        tokens = super().tokenize(text, **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
Returns a tokenized string.

We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
`['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
`unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
r   )out_typeN)	r)   
        """
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return self.sp_model.encode(text, out_type=str)

        # 1. Encode the string with the unk_token prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Strip the unk_token pieces from the front of the result
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
            tokens[0] = tokens[0][1:]

        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["T5Tokenizer"]