
    fTh7                         S SK r S SKJr  S SKJrJrJrJrJrJ	r	  S SK
rSSKJrJr  SSKJr  SSKJr  \(       a  SSKJr  \R*                  " \5      rS	S
0rSr\" SS9 " S S\5      5       rS/rg)    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )
AddedTokenPreTrainedTokenizer)logging)requires)	TextInput
vocab_fileztokenizer.modelu   ▁)sentencepiece)backendsc            
         ^  \ rS rSrSr\rSS/r          S S\\	\
\4      4U 4S jjjrS rS r\S	 5       rS
 rSSS\\
   4U 4S jjrS rS rS rS rS!S\\
   S\\
   4S jjrS!S jr S"S\\   S\\\      S\S\\   4U 4S jjjr S!S\\   S\\\      S\\   4S jjr  S#S\\   S\S\S\
4S jjrSr U =r!$ )$GemmaTokenizer+   aa
  
Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
no padding token in the original model.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
    eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
        The end of sequence token.
    pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
        A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
        attention mechanisms or loss computation.
    sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

    add_bos_token (`bool`, *optional*, defaults to `True`):
        Whether or not to add an `bos_token` at the start of sequences.
    add_eos_token (`bool`, *optional*, defaults to `False`):
        Whether or not to add an `eos_token` at the end of sequences.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
        extra spaces.
    use_default_system_prompt (`bool`, *optional*, defaults to `False`):
        Whether or not the default system prompt for Gemma should be used.
    spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to add spaces between special tokens.
	input_idsattention_masksp_model_kwargsc                   > Uc  0 OUU l         [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUnXl        Xpl        Xl        Xl        [        R                  " S0 U R                   D6U l
        U R                  R                  U5        [        TU ]4  " SUUUUUUUU	U
US.
UD6  g )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens )r   
isinstancestrr   r   r    r!   r#   spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r    r!   r"   r#   r$   kwargs	__class__s                d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/gemma/tokenization_gemma.pyr-   GemmaTokenizer.__init__^   s    &5%<r/MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	$**)B&22JT5I5IJ:& 	
''+)E&?*G	
 	
    c                 ~    U R                   R                  5       nS US'   U R                  R                  5       US'   U$ )Nr*   sp_model_proto)__dict__copyr*   serialized_model_proto)r.   states     r1   __getstate__GemmaTokenizer.__getstate__   s;    ""$ j"&--"F"F"Hr3   c                     U R                   R                  U5        [        R                  " S0 U R                  D6U l        U R
                  R                  U R                  5        g )Nr%   )r6   updater(   r)   r   r*   LoadFromSerializedProtor5   )r.   ds     r1   __setstate__GemmaTokenizer.__setstate__   sG    Q22JT5I5IJ--d.A.ABr3   c                 6    U R                   R                  5       $ )zReturns vocab size)r*   get_piece_size)r.   s    r1   
vocab_sizeGemmaTokenizer.vocab_size   s     }}++--r3   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf )zReturns vocab as a dict)rangerD   convert_ids_to_tokensr=   added_tokens_encoder)r.   ivocabs      r1   	get_vocabGemmaTokenizer.get_vocab   sL    ;@;QR;Qa++A.1;QRT../ Ss   Atextr   returnc                 &   > [         TU ]  " U40 UD6$ )zE
Args:
    text: TextInput
Simply calls PreTrainedTokenizer's method
)r,   tokenize)r.   rN   r/   r0   s      r1   rQ   GemmaTokenizer.tokenize   s     w///r3   c                 >    U R                   R                  U[        S9$ )zf
Args:
    text: TextInput
Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
)out_type)r*   encoder'   )r.   rN   r/   s      r1   	_tokenizeGemmaTokenizer._tokenize   s     }}##D3#77r3   c                 8    U R                   R                  U5      $ )z0Converts a token (str) in an id using the vocab.)r*   piece_to_id)r.   tokens     r1   _convert_token_to_id#GemmaTokenizer._convert_token_to_id   s    }}((//r3   c                 <    U R                   R                  U5      nU$ )z=Converts an index (integer) in a token (str) using the vocab.)r*   	IdToPiece)r.   indexrZ   s      r1   _convert_id_to_token#GemmaTokenizer._convert_id_to_token   s    ''.r3   c                     / nSnU HG  nX@R                   ;   a$  X0R                  R                  U5      U-   -  n/ nM6  UR                  U5        MI     X0R                  R                  U5      -  nU$ )z:Converts a sequence of tokens (string) in a single string. )_added_tokens_encoderr*   decodeappend)r.   tokenscurrent_sub_tokens
out_stringrZ   s        r1   convert_tokens_to_string'GemmaTokenizer.convert_tokens_to_string   st    
E222mm223EFNN
%'""))%0  	mm**+=>>
r3   filename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g[         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  U R                  5      [         R                  R                  U5      :w  aG  [         R                  R                  U R                  5      (       a  [        U R                  U5        U4$ [         R                  R                  U R                  5      (       dC  [        US5       nU R                  R                  5       nUR                  U5        SSS5        U4$ U4$ ! , (       d  f       U4$ = f)z
Save the vocabulary and special tokens file to a directory.

Args:
    save_directory (`str`):
        The directory in which to save the vocabulary.

Returns:
    `Tuple(str)`: Paths to the files saved.
zVocabulary path (z) should be a directoryN-rc   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr*   r8   write)r.   save_directoryrl   out_vocab_fileficontent_spiece_models         r1   save_vocabularyGemmaTokenizer.save_vocabulary   s.    ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrSrT__n5    00nd+r'+}}'K'K'M$-. ,     	 ,+   s   ?,E99
F	c                     U R                   (       a  U R                  /O/ nU R                  (       a  U R                  /O/ nX1-   U-   nUb
  XS-   U-   U-   nU$ N)r    bos_token_idr!   eos_token_idr.   token_ids_0token_ids_1r   r   outputs         r1    build_inputs_with_special_tokens/GemmaTokenizer.build_inputs_with_special_tokens   s\    .2.@.@))*b.2.@.@))*b+l:"*[8<GFr3   r   r   already_has_special_tokensc                   > U(       a  [         TU ]  XSS9$ U R                  (       a  S/O/ nU R                  (       a  S/O/ nUc  US/[	        U5      -  -   U-   $ US/[	        U5      -  -   U-   U-   S/[	        U5      -  -   U-   $ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
T)r   r   r      r   )r,   get_special_tokens_maskr    r!   len)r.   r   r   r   r   r   r0   s         r1   r   &GemmaTokenizer.get_special_tokens_mask   s    $ &72']a 3   #00sb"00sbA3[)9#9:\IIsS%%'  sS%%	'
 	
r3   c                     U R                   (       a  U R                  /O/ nU R                  (       a  U R                  /O/ nS/[	        X1-   U-   5      -  nUb  US/[	        X2-   U-   5      -  -  nU$ )aM  
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
```

if token_ids_1 is None, only returns the first portion of the mask (0s).

Args:
    token_ids_0 (`List[int]`):
        List of ids.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
r   r   )r    r   r!   r   r   r   s         r1   $create_token_type_ids_from_sequences3GemmaTokenizer.create_token_type_ids_from_sequences  sv    . /3.@.@))*b.2.@.@))*bs<5DEE"qcC :\ IJJJFr3   	token_idsskip_special_tokensr$   c                 (   / n/ nU H  nU(       a  XpR                   ;   a  M  XpR                  ;   a]  U(       a*  UR                  U R                  R	                  U5      5        UR                  U R                  U   R
                  5        / nM  UR                  U5        M     U(       a*  UR                  U R                  R	                  U5      5        U(       a  SR                  U5      nOSR                  U5      nUR                  [        S5      $ )N rc   )	all_special_ids_added_tokens_decoderrf   r*   re   contentru   replaceSPIECE_UNDERLINE)r.   r   r   r$   r/   	sub_textscurrent_sub_textidss           r1   _decodeGemmaTokenizer._decode1  s     	C"s.B.B'B000#$$T]]%9%9:J%KL  !;!;C!@!H!HI#%  '',  T]]112BCD(+I	*I  !1377r3   )r    r!   r*   r   r#   r   )
z<unk>z<bos>z<eos>z<pad>NTFFFFr   )NF)FF)"__name__
__module____qualname____firstlineno____doc__rv   vocab_files_namesmodel_input_namesr   r   r'   r   r-   r:   r@   propertyrD   rL   r   rQ   rV   r[   r`   rj   r	   r   r   intboolr   r   r   __static_attributes____classcell__)r0   s   @r1   r   r   +   s   ,\ *$&67
 48%*"'&+(
 "$sCx.1(
 (
TC
 . .0[ 0tCy 080
!x} !X]^aXb !6	 sx#
9#
3;DI3F#
ko#
	c#
 #
L JN93;DI3F	cH %*.3	898 "8 (,	8 
8 8r3   r   )rp   shutilr   typingr   r   r   r   r   r	   r   r(   tokenization_utilsr   r   utilsr   utils.import_utilsr   tokenization_utils_baser   
get_loggerr   rs   rv   r   r   __all__r%   r3   r1   <module>r      sy   , 
  B B  A  * 4			H	%!#45   
%&`8( `8 '`8F	 
r3   