
"""Tokenization classes for CANINE."""

from typing import Dict, List, Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

# Unicode defines 1,114,112 total codepoints.
UNICODE_VOCAB_SIZE = 1114112

# Canonical codepoints for special, pseudo-characters.
PAD = 0
CLS = 0xE000
SEP = 0xE001
BOS = 0xE002
MASK = 0xE003
RESERVED = 0xE004

# Maps special codepoints to human-readable names.
SPECIAL_CODEPOINTS: Dict[int, str] = {
    # Special symbols use codepoint values that are valid but designated as
    # "Private Use", so they will never be assigned characters by the Unicode
    # Consortium and are safe to repurpose here.
    CLS: "[CLS]",
    SEP: "[SEP]",
    BOS: "[BOS]",
    MASK: "[MASK]",
    PAD: "[PAD]",
    RESERVED: "[RESERVED]",
}

# Maps special codepoint human-readable names to their codepoint values.
SPECIAL_CODEPOINTS_BY_NAME: Dict[str, int] = {name: codepoint for codepoint, name in SPECIAL_CODEPOINTS.items()}


class CanineTokenizer(PreTrainedTokenizer):
    r"""
    Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
    converts each character into its Unicode code point.

    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

    Args:
        model_max_length (`int`, *optional*, defaults to 2048):
            The maximum sentence length the model accepts.
    """

    def __init__(
        self,
        bos_token=chr(CLS),
        eos_token=chr(SEP),
        sep_token=chr(SEP),
        cls_token=chr(CLS),
        pad_token=chr(PAD),
        mask_token=chr(MASK),
        add_prefix_space=False,
        model_max_length=2048,
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token

        # The mask token behaves like a normal word, i.e. it includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        # Creates a mapping for looking up the IDs of special symbols.
        self._special_codepoints: Dict[str, int] = {}
        for codepoint, name in SPECIAL_CODEPOINTS.items():
            self._special_codepoints[name] = codepoint

        # Creates a mapping for looking up the string forms of special symbol IDs.
        self._special_codepoint_strings: Dict[int, str] = {
            codepoint: name for name, codepoint in self._special_codepoints.items()
        }

        self._unicode_vocab_size = UNICODE_VOCAB_SIZE
        self._num_special_tokens = len(self._special_codepoints)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            add_prefix_space=add_prefix_space,
            model_max_length=model_max_length,
            **kwargs,
        )

    @property
    def vocab_size(self) -> int:
        return self._unicode_vocab_size

    def get_vocab(self):
        vocab = {chr(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        """Tokenize a string (i.e. perform character splitting)."""
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        """Converts a token (i.e. a Unicode character) to an id (i.e. its integer Unicode code point value)."""
        try:
            return ord(token)
        except TypeError:
            raise ValueError(f"invalid token: '{token}'")

    def _convert_id_to_token(self, index: int) -> str:
        """
        Converts a Unicode code point (integer) to a token (str). In case it's a special code point, convert to
        human-readable format.
        """
        try:
            if index in SPECIAL_CODEPOINTS:
                return SPECIAL_CODEPOINTS[index]
            return chr(index)
        except TypeError:
            raise ValueError(f"invalid id: {index}")

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CANINE sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            result += len(token_ids_1 + sep) * [1]
        return result

    # CanineTokenizer has no vocab file, so there is nothing to save.
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):
        return ()


__all__ = ["CanineTokenizer"]