
    fThu                     |    S r SSKJrJrJrJr  SSKJrJr  SSK	J
r
  \
R                  " \5      r " S S\5      rS/rg)	z!Tokenization class for Perceiver.    )DictListOptionalTuple   )
AddedTokenPreTrainedTokenizer)loggingc            
       6  ^  \ rS rSrSrSS/r       S SU 4S jjjrS\\\	4   4S jr
\S 5       r SS	\\	   S
\\\	      S\S\\	   4U 4S jjjr SS	\\	   S
\\\	      S\\	   4S jjrS\S\\   4S jrS rS rS rSS\S\\   S\\   4S jjrSrU =r$ )PerceiverTokenizer   a  
Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    pad_token (`str`, *optional*, defaults to `"[PAD]"`):
        The token used for padding, for example when batching sequences of different lengths.
    bos_token (`str`, *optional*, defaults to `"[BOS]"`):
        The BOS token (reserved in the vocab, but not actually used).
    eos_token (`str`, *optional*, defaults to `"[EOS]"`):
        The end of sequence token (reserved in the vocab, but not actually used).

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    mask_token (`str`, *optional*, defaults to `"[MASK]"`):
        The MASK token, useful for masked language modeling.
    cls_token (`str`, *optional*, defaults to `"[CLS]"`):
        The CLS token (reserved in the vocab, but not actually used).
    sep_token (`str`, *optional*, defaults to `"[SEP]"`):
        The separator token, which is used when building a sequence from two sequences.

	input_idsattention_maskreturnc                 ,  > [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUnSU l        UUUUUUS.U l        [        U R                  5      U l        [        T	U ]   " SUUUUUUUS.UD6  g )NF)lstriprstrip   )r         r         )	pad_token	bos_token	eos_token
mask_token	cls_token	sep_tokenmodel_max_length )	
isinstancestrr   _utf_vocab_size_added_tokens_decoderlen_num_special_tokenssuper__init__)
selfr   r   r   r   r   r   r   kwargs	__class__s
            l/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/perceiver/tokenization_perceiver.pyr(   PerceiverTokenizer.__init__;   s-    JTT]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	KUV`beKfKfZ
5Glv
IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	# 6
" $'t'A'A#B  		
!-		
 		
    c                     0 n[        U R                  5       H  n[        U5      nX R                  -   X'   M      UR	                  U R
                  5        U$ N)ranger#   chrr&   updateadded_tokens_encoder)r)   vocabitokens       r,   	get_vocabPerceiverTokenizer.get_vocabd   sN    t++,AFE777EL - 	T../r.   c                     U R                   $ r0   )r#   )r)   s    r,   
vocab_sizePerceiverTokenizer.vocab_sizel   s    ###r.   token_ids_0token_ids_1already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ Uc  S/S/[        U5      -  -   S/-   $ S/S/[        U5      -  -   S/-   S/[        U5      -  -   S/-   $ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
T)r=   r>   r?   r   r   )r'   get_special_tokens_maskr%   )r)   r=   r>   r?   r+   s       r,   rA   *PerceiverTokenizer.get_special_tokens_maskp   s    $ &72']a 3  
 3!s;///1#55sqcC,,-3sS=M7MNRSQTTTr.   c                     Uc  U R                   /U-   U R                  /-   $ U R                   /U-   U R                  /-   U-   U R                  /-   $ )a  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks. A sequence has the
following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)cls_token_idsep_token_id)r)   r=   r>   s      r,    build_inputs_with_special_tokens3PerceiverTokenizer.build_inputs_with_special_tokens   sb    & %%&48I8I7JJJ%%&48I8I7JJ[X\`\m\m[nnnr.   textc                 d    UR                  S5       Vs/ s H  n[        U5      PM     nnU$ s  snf )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsutf-8)encoder2   )r)   rH   r6   tokenss       r,   	_tokenizePerceiverTokenizer._tokenize   s/    "&++g"67"6Q#a&"67 8s   -c                 p    [        U5      S:w  a  U R                  nU$ [        U5      U R                  -   nU$ )z0Converts a token (str) in an id using the vocab.r   )r%   unk_token_idordr&   )r)   r7   token_ids      r,   _convert_token_to_id'PerceiverTokenizer._convert_token_to_id   s:    u:?((H  5zD$<$<<Hr.   c                 4    [        XR                  -
  5      nU$ )z=Converts an index (integer) in a token (str) using the vocab.)r2   r&   )r)   indexr7   s      r,   _convert_id_to_token'PerceiverTokenizer._convert_id_to_token   s    E4445r.   c                     SnU HF  nX0R                   ;   a  [        U5      R                  S5      nO[        [	        U5      /5      nX$-  nMH     UR                  SSS9nU$ )z:Converts a sequence of tokens (string) in a single string.r.   rJ   replace)errors)r4   r"   rK   bytesrQ   decode)r)   rL   bstringr7   
tok_stringstrings         r,   convert_tokens_to_string+PerceiverTokenizer.convert_tokens_to_string   sb    E111 Z..w7
"CJ<0
!G  	:r.   save_directoryfilename_prefixc                     g)Nr    r    )r)   rc   rd   s      r,   save_vocabulary"PerceiverTokenizer.save_vocabulary   s    r.   )r$   r&   r#   )z[PAD]z[BOS]z[EOS]z[MASK]z[CLS]z[SEP]i   )r   N)NFr0   )__name__
__module____qualname____firstlineno____doc__model_input_namesr(   r   r"   intr8   propertyr;   r   r   boolrA   rF   rM   rS   rW   ra   r   rf   __static_attributes____classcell__)r+   s   @r,   r   r      s;   < %&67 '
 
'
 '
R4S>  $ $ sxU9U3;DI3FUkoU	cU U: JNo9o3;DI3Fo	co0c d3i 

c HSM ]bcf]g  r.   r   N)rl   typingr   r   r   r   tokenization_utilsr   r	   utilsr
   
get_loggerrh   loggerr   __all__r    r.   r,   <module>ry      sB    ( . . A  
		H	%k, k\  
 r.   