
    fTh>                        S r SSKrSSKrSSKrSSKrSSKJr  SSKJrJ	r	J
r
JrJrJr  SSKrSSKJr  SSKJr  SSKJr  \(       a  SS	KJr  SS
KJrJr  SSKJr  \R8                  " \5      rSS0rSr \" SS9 " S S\5      5       r!S/r"g)z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc            
         ^  \ rS rSrSr\rSS/r       S%S\\	\
\4      SS4U 4S jjjrS	 r\S
 5       rS r S&S\\   S\\\      S\S\\   4U 4S jjjrS\\   S\\   4S jr S'S\\   S\\\      S\\   4S jjr S'S\\   S\\\      S\\   4S jjrS rS rS\
S\
4S jrSS.S jrS(SSS\\
   4U 4S jjjr\S 5       rS rS rS r S  r!S'S!\
S"\\
   S\"\
   4S# jjr#S$r$U =r%$ ))SiglipTokenizer-   ad  
Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
        contains the vocabulary necessary to instantiate a tokenizer.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"</s>"`):
        The token used for padding, for example when batching sequences of different lengths.
    additional_special_tokens (`List[str]`, *optional*):
        Additional special tokens used by the tokenizer.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.
    model_max_length (`int`, *optional*, defaults to 64):
        The maximum length (in number of tokens) for model inputs.
    do_lower_case (`bool`, *optional*, defaults to `True`):
        Whether or not to lowercase the input when tokenizing.
	input_idsattention_maskNsp_model_kwargsreturnc	                   > [        U S5        [        U[        5      (       a  [        USSSSS9OUn[        U[        5      (       a  [        USSSSS9OUn[        U[        5      (       a  [        USSSSS9OUnUc  0 OUU l        Xl        Xl        U R                  5       U l        Xl        [        T
U ](  " SUUUUU R                  UUS.U	D6  g )NprotobufTF)rstriplstrip
normalizedspecial)	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case )r   
isinstancestrr   r   r'   r   get_spm_processorsp_modelsuper__init__)selfr   r"   r#   r$   r%   r   r&   r'   kwargs	__class__s             f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/siglip/tokenization_siglip.pyr.   SiglipTokenizer.__init__Z   s     	$
+ )S)) yduVZ[ 	 )S)) yduVZ[ 	 )S)) yduVZ[ 	 &5%<r/*$..0$ 		
&? 00-'		
 		
    c                    [         R                  " S0 U R                  D6n[        U R                  S5       nUR                  5       n[        5       nUR                  R                  U5      nUR                  5       nSUl
        UR                  R                  U5        UR                  5       nUR                  U5        S S S 5        U$ ! , (       d  f       U$ = f)NrbFr(   )spmSentencePieceProcessorr   openr   readr   
ModelProto
FromStringNormalizerSpecadd_dummy_prefixnormalizer_spec	MergeFromSerializeToStringLoadFromSerializedProto)r/   	tokenizerfr,   	model_pb2modelr?   s          r2   r+   !SiglipTokenizer.get_spm_processor   s    ..F1E1EF	$//4(AvvxH')I((33H=E'668O/4O,!!++O<..0H--h7 )  )( s   B	C


Cc                 6    U R                   R                  5       $ N)r,   get_piece_sizer/   s    r2   
vocab_sizeSiglipTokenizer.vocab_size   s     }}++--r4   c                     [        U R                  5       Vs0 s H  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf rI   )rangerL   convert_ids_to_tokensupdateadded_tokens_encoder)r/   ivocabs      r2   	get_vocabSiglipTokenizer.get_vocab   sL    ;@;QR;Qa++A.1;QRT../ Ss   Atoken_ids_0token_ids_1already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ Uc  S/[        U5      -  S/-   $ S/[        U5      -  S/-   S/[        U5      -  -   S/-   $ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
T)rW   rX   rY   r      )r-   get_special_tokens_masklen)r/   rW   rX   rY   r1   s       r2   r\   'SiglipTokenizer.get_special_tokens_mask   sw    $ &72']a 3  
 C#k**qc11c+&&1#-!s;7G1GHA3NNr4   	token_idsc                     [        U5      S:  a9  US   U R                  :X  a&  [        R                  " SU R                   S35        U$ XR                  /-   $ )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r]   eos_token_idwarningswarnr"   )r/   r_   s     r2   _add_eos_if_not_present'SiglipTokenizer._add_eos_if_not_present   s[    y>A)B-43D3D"DMM,T^^,< =+ +  1 1222r4   c                 r    U R                   /nUc  [        X-   5      S/-  $ [        X-   U-   U-   5      S/-  $ )aw  
Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of zeros.
r   )rb   r]   )r/   rW   rX   eoss       r2   $create_token_type_ids_from_sequences4SiglipTokenizer.create_token_type_ids_from_sequences   sL        !{()QC//;${2S89QC??r4   c                 X    U R                  U5      nUc  U$ U R                  U5      nX-   $ )a"  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:

- single sequence: `X </s>`
- pair of sequences: `A </s> B </s>`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)re   )r/   rW   rX   s      r2    build_inputs_with_special_tokens0SiglipTokenizer.build_inputs_with_special_tokens   s9    & 22;?66{CK,,r4   c                 D    U R                   R                  5       nS US'   U$ )Nr,   )__dict__copy)r/   states     r2   __getstate__SiglipTokenizer.__getstate__   s#    ""$ jr4   c                     Xl         [        U S5      (       d  0 U l        [        R                  " S0 U R                  D6U l        U R
                  R                  U R                  5        g )Nr   r(   )ro   hasattrr   r7   r8   r,   Loadr   )r/   ds     r2   __setstate__SiglipTokenizer.__setstate__  sP     t.//#%D 22JT5I5IJ4??+r4   textc                 j    UR                  [        R                  SS[        R                  5      5      $ )N )	translater*   	maketransstringpunctuation)r/   rz   s     r2   remove_punctuation"SiglipTokenizer.remove_punctuation  s$    ~~cmmBF4F4FGHHr4   keep_punctuation_exact_stringc                   ^  U(       a+  UR                  U 4S jUR                  U5       5       5      nOT R                  U5      n[        R                  " SSU5      nUR                  5       nU$ )a]  Returns canonicalized `text` (puncuation removed).

Args:
    text (`str`):
        String to be canonicalized.
    keep_punctuation_exact_string (`str`, *optional*):
        If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
        (but will still remove '{' and '}' that appear separately).
c              3   F   >#    U  H  nTR                  U5      v   M     g 7frI   )r   ).0partr/   s     r2   	<genexpr>4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>  s#      6:c$''--:cs   !z\s+ )joinsplitr   resubstrip)r/   rz   r   s   `  r2   canonicalize_text!SiglipTokenizer.canonicalize_text  sc     )055 6:>**Eb:c6 D **40Dvvfc4(zz|r4   r   c                    > [         TU ]  " [        UR                  [        S5      -   40 UD6n[	        U5      S:  a%  US   [        :X  a  US   U R
                  ;   a  USS nU$ )z(
Converts a string to a list of tokens.
r   r[   r   N)r-   tokenizeSPIECE_UNDERLINEreplacer]   all_special_tokens)r/   rz   add_special_tokensr0   tokensr1   s        r2   r   SiglipTokenizer.tokenize(  se     !"2T\\BRTW5X"Xc\bcv;?vay,<<dNeNeAeABZFr4   c                 p    [        U R                  R                  [        U R                  5      5      5      $ rI   )r]   r,   encoder*   r#   rK   s    r2   unk_token_length SiglipTokenizer.unk_token_length2  s'     4==''DNN(;<==r4   c                    U R                  USS9nU R                  R                  U[        S9nU R                  R                  U R                  U-   [        S9n[        U5      U R                  :  a  X0R                  S $ U$ )u  
Returns a tokenized string.

We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
SPIECE_UNDERLINE.

For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
`self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
Nr   )out_type)r   r,   r   r*   r#   r]   r   )r/   rz   r0   r   s       r2   	_tokenizeSiglipTokenizer._tokenize7  s     %%d$%O%%dS%9 %%dnnt&;c%J25f+AVAV2Vv++-.b\bbr4   c                 8    U R                   R                  U5      $ )z0Converts a token (str) in an id using the vocab.)r,   piece_to_id)r/   tokens     r2   _convert_token_to_id$SiglipTokenizer._convert_token_to_idL  s    }}((//r4   c                 <    U R                   R                  U5      nU$ )z=Converts an index (integer) in a token (str) using the vocab.)r,   	IdToPiece)r/   indexr   s      r2   _convert_id_to_token$SiglipTokenizer._convert_id_to_tokenQ  s    ''.r4   c                 "   / nSnSnU HW  nXPR                   ;   a2  U(       d  US-  nX0R                  R                  U5      U-   -  nSn/ nMD  UR                  U5        SnMY     X0R                  R                  U5      -  nUR	                  5       $ )z:Converts a sequence of tokens (string) in a single string.r|   Fr   T)r   r,   decodeappendr   )r/   r   current_sub_tokens
out_stringprev_is_specialr   s         r2   convert_tokens_to_string(SiglipTokenizer.convert_tokens_to_stringV  s    
E///&#%Jmm223EFNN
"&%'""))%0"'  	mm**+=>>
!!r4   save_directoryfilename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  U R                  5      [         R                  R                  U5      :w  aG  [         R                  R                  U R                  5      (       a  [        U R                  U5        U4$ [         R                  R                  U R                  5      (       dC  [        US5       nU R                  R                  5       nUR                  U5        S S S 5        U4$ U4$ ! , (       d  f       U4$ = f)NzVocabulary path (z) should be a directory-r|   r   wb)ospathisdirloggererrorr   VOCAB_FILES_NAMESabspathr   isfiler   r9   r,   serialized_model_protowrite)r/   r   r   out_vocab_fileficontent_spiece_models         r2   save_vocabularySiglipTokenizer.save_vocabularyj  s,   ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrSrT__n5    00nd+r'+}}'K'K'M$-. ,     	 ,+   s   ?,E99
F	)ro   r'   r,   r   r   )</s>z<unk>r   NN@   T)NFrI   )F)&__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr   r   r*   r   r.   r+   propertyrL   rU   r   intboolr\   re   ri   rl   rr   rx   r   r   r   r   r   r   r   r   r	   r   __static_attributes____classcell__)r1   s   @r2   r   r   -   s   &P *$&67
 "&48/
 "$sCx.1/
 
/
 /
b . . sxO9O3;DI3FOkoO	cO O:	3c 	3tCy 	3 JN@9@3;DI3F@	c@0 JN-9-3;DI3F-	c-6,Is Is I HL *[ QUVYQZ   > >c*0

"(!c !HSM !]bcf]g ! !r4   r   )#r   r   r   r   rc   shutilr   typingr   r   r   r   r   r	   r   r7   convert_slow_tokenizerr   tokenization_utilsr   tokenization_utils_baser   r   utilsr   r   utils.import_utilsr   
get_loggerr   r   r   r   r   __all__r(   r4   r2   <module>r      s    + 	 	    B B  5 5 1 4 / * 
		H	%!>2    
%&K!) K! 'K!\
 
r4   