
"""Tokenization class for model ByT5."""

import warnings
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class ByT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
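
    Example (a quick usage sketch; the exact ids assume the default special tokens and the byte offset of 3
    defined below):

    ```python
    >>> from transformers import ByT5Tokenizer

    >>> tokenizer = ByT5Tokenizer()
    >>> tokenizer("hello").input_ids  # each utf-8 byte shifted by the 3 special-token ids, plus eos (id 1)
    [107, 104, 111, 111, 114, 1]
    ```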
	input_idsattention_maskreturnc           	      L  > US:  a#  Uc   [        U5       Vs/ s H	  nSU S3PM     nnONUS:  aH  UbE  [        U5      S:  a6  [        [        [        S U5      5      5      nX:w  a  [	        SU SU S35      e[        U[        5      (       a  [        USSS	9OUn[        U[        5      (       a  [        USSS	9OUn[        U[        5      (       a  [        USSS	9OUnX1US
.U l        [        U R                  5      U l	        SU l
        [        T	U ]0  " SUUUSUS.UD6  g s  snf )Nr   z
<extra_id_>c                 0    [        S[        U 5      ;   5      $ )Nextra_id)boolstr)xs    b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/byt5/tokenization_byt5.py<lambda>(ByT5Tokenizer.__init__.<locals>.<lambda>L   s    Ds1v9M4N    zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to ByT5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensT)lstriprstrip)r            )	eos_token	unk_token	pad_token	extra_idsadditional_special_tokens )rangelensetfilter
ValueError
isinstancer   r   _added_tokens_decoderoffset_utf_vocab_sizesuper__init__)
selfr    r!   r"   r#   r$   kwargsiextra_tokens	__class__s
            r   r0   ByT5Tokenizer.__init__>   sT    q=6>DI)DT(UDTq:aS):DT%(U%]8DMfIgjkIks6*NPi#jklL( &yk1RSlRm n( (  HRR[]`GaGaJydCgp	GQR[]`GaGaJydCgp	GQR[]`GaGaJydCgp	)2Y%O"$445# 	
&?	
 	
' )Vs   D!c                     U R                   $ N)r.   )r1   s    r   
vocab_sizeByT5Tokenizer.vocab_sizee   s    ###r   c                     [        U R                  U R                  -   5       Vs0 s H  oR                  U5      U_M     nnUR	                  U R
                  5        U$ s  snf r8   )r&   r9   r-   convert_ids_to_tokensupdateadded_tokens_encoder)r1   r3   vocabs      r   	get_vocabByT5Tokenizer.get_vocabi   sX    ;@SWS^S^A^;_`;_a++A.1;_`T../ as   Atoken_ids_0token_ids_1already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ Uc  S/[        U5      -  S/-   $ S/[        U5      -  S/-   S/[        U5      -  -   S/-   $ )ad  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
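
        Example (a quick sketch; the ids assume the default byte offset of 3 and eos id 1):

        ```python
        >>> from transformers import ByT5Tokenizer

        >>> tokenizer = ByT5Tokenizer()
        >>> ids = tokenizer("hi").input_ids  # [107, 108, 1]
        >>> tokenizer.get_special_tokens_mask(ids[:-1])
        [0, 0, 1]
        ```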
T)rB   rC   rD   r   r   )r/   get_special_tokens_maskr'   )r1   rB   rC   rD   r5   s       r   rF   %ByT5Tokenizer.get_special_tokens_maskn   sw    $ &72']a 3  
 C#k**qc11c+&&1#-!s;7G1GHA3NNr   	token_idsc                     [        U5      S:  a9  US   U R                  :X  a&  [        R                  " SU R                   S35        U$ XR                  /-   $ )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r'   eos_token_idwarningswarnr    )r1   rH   s     r   _add_eos_if_not_present%ByT5Tokenizer._add_eos_if_not_present   s[    y>A)B-43D3D"DMM,T^^,< =+ +  1 1222r   c                 r    U R                   /nUc  [        X-   5      S/-  $ [        X-   U-   U-   5      S/-  $ )ay  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
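
        Example (sketch): with `token_ids_0 = [107, 108]` and no second sequence, the count covers the sequence
        plus one eos, so the method returns `[0, 0, 0]`.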
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
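
        Example (a quick sketch; eos is id 1 in the default layout):

        ```python
        >>> from transformers import ByT5Tokenizer

        >>> tokenizer = ByT5Tokenizer()
        >>> tokenizer.build_inputs_with_special_tokens([107, 108])
        [107, 108, 1]
        >>> tokenizer.build_inputs_with_special_tokens([107, 108], [110, 111])
        [107, 108, 1, 110, 111, 1]
        ```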
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_decoder:
                tok_string = self.added_tokens_decoder[token].encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = bytes([ord(token)])
            bstring += tok_string
        string = bstring.decode("utf-8", errors="ignore")
        return string

    # ByT5Tokenizer has no vocab file to save, so this is a no-op.
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        return ()

__all__ = ["ByT5Tokenizer"]