o
    Zh>                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZ ddlZddlmZ ddlmZ ddlmZ erFdd	lmZ dd
lmZmZ ddlmZ eeZddiZdZ eddG dd deZ!dgZ"dS )z$Tokenization class for SigLIP model.    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )import_protobuf)PreTrainedTokenizer)
AddedToken)	TextInput)loggingrequires_backends)requires
vocab_filezspiece.modelu   ▁)sentencepiece)backendsc                
       s  e Zd ZdZeZddgZ							d;d	eee	e
f  d
df fddZdd Zedd Zdd Z	d<dee deee  ded
ee f fddZdee d
ee fddZ	d=dee deee  d
ee fddZ	d=dee deee  d
ee fddZd d! Zd"d# Zd$e	d
e	fd%d&Zdd'd(d)Zd>d$d*d
ee	 f fd+d,Zed-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Z d=d7e	d8ee	 d
e!e	 fd9d:Z"  Z#S )?SiglipTokenizera  
    Construct a Siglip tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"</s>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        model_max_length (`int`, *optional*, defaults to 64):
            The maximum length (in number of tokens) for model inputs.
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
    Z	input_idsZattention_mask</s><unk>N@   Tsp_model_kwargsreturnc	           
   
      s   t | d t|trt|dddddn|}t|tr#t|dddddn|}t|tr3t|dddddn|}|d u r;i n|| _|| _|| _|  | _|| _t	 j
d||||| j||d|	 d S )NprotobufTF)rstriplstrip
normalizedZspecial)	eos_token	unk_token	pad_tokenadditional_special_tokensr   model_max_lengthdo_lower_case )r   
isinstancestrr   r   r#   r   get_spm_processorsp_modelsuper__init__)
selfr   r   r   r    r!   r   r"   r#   kwargs	__class__r$   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/siglip/tokenization_siglip.pyr*   Z   s:   


zSiglipTokenizer.__init__c                 C   s   t jdi | j}t| jd,}| }t }|j|}|	 }d|_
|j| | }|| W d    |S 1 s>w   Y  |S )NrbFr$   )spmSentencePieceProcessorr   openr   readr
   Z
ModelProtoZ
FromStringZNormalizerSpecZadd_dummy_prefixnormalizer_specZ	MergeFromZSerializeToStringZLoadFromSerializedProto)r+   Z	tokenizerfr(   Z	model_pb2modelr5   r$   r$   r/   r'      s   
		z!SiglipTokenizer.get_spm_processorc                 C   s
   | j  S N)r(   Zget_piece_sizer+   r$   r$   r/   
vocab_size   s   
zSiglipTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r$   )Zconvert_ids_to_tokens).0ir9   r$   r/   
<dictcomp>   s    z-SiglipTokenizer.get_vocab.<locals>.<dictcomp>)ranger:   updateZadded_tokens_encoder)r+   Zvocabr$   r9   r/   	get_vocab   s   zSiglipTokenizer.get_vocabFtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rA   rB   rC   Nr      )r)   get_special_tokens_masklen)r+   rA   rB   rC   r-   r$   r/   rE      s   (z'SiglipTokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)rF   eos_token_idwarningswarnr   )r+   rG   r$   r$   r/   _add_eos_if_not_present   s   z'SiglipTokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )rI   rF   )r+   rA   rB   Zeosr$   r$   r/   $create_token_type_ids_from_sequences   s   z4SiglipTokenizer.create_token_type_ids_from_sequencesc                 C   s(   |  |}|du r|S |  |}|| S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rL   )r+   rA   rB   r$   r$   r/    build_inputs_with_special_tokens   s
   

z0SiglipTokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr(   )__dict__copy)r+   stater$   r$   r/   __getstate__   s   
zSiglipTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r$   )rO   hasattrr   r1   r2   r(   Loadr   )r+   dr$   r$   r/   __setstate__  s
   
zSiglipTokenizer.__setstate__textc                 C   s   | tddtjS )N )	translater&   	maketransstringpunctuation)r+   rW   r$   r$   r/   remove_punctuation  s   z"SiglipTokenizer.remove_punctuationkeep_punctuation_exact_stringc                   sH   |r|  fdd||D }n |}tdd|}| }|S )a  Returns canonicalized `text` (puncuation removed).

        Args:
            text (`str`):
                String to be canonicalized.
            keep_punctuation_exact_string (`str`, *optional*):
                If provided, then this exact string is kept. For example providing '{}' will keep any occurrences of '{}'
                (but will still remove '{' and '}' that appear separately).
        c                 3   s    | ]}  |V  qd S r8   )r]   )r;   partr9   r$   r/   	<genexpr>  s    

z4SiglipTokenizer.canonicalize_text.<locals>.<genexpr>z\s+ )joinsplitr]   resubstrip)r+   rW   r_   r$   r9   r/   canonicalize_text  s   


z!SiglipTokenizer.canonicalize_textr   c                    sV   t  jt|td fi |}t|dkr)|d tkr)|d | jv r)|dd }|S )z8
        Converts a string to a list of tokens.
        rb   rD   r   N)r)   tokenizeSPIECE_UNDERLINEreplacerF   all_special_tokens)r+   rW   Zadd_special_tokensr,   tokensr-   r$   r/   ri   (  s    &zSiglipTokenizer.tokenizec                 C   s   t | jt| jS r8   )rF   r(   encoder&   r   r9   r$   r$   r/   unk_token_length2  s   z SiglipTokenizer.unk_token_lengthc                 K   sT   | j |dd}| jj|td}| jj| j| td}t|| jkr(|| jd S |S )u*  
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE.

        For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give `['H', 'e', 'y']` instead of `['▁He', 'y']`.

        Thus we always encode `f"{unk_token}text"` and strip the `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        Nr^   )Zout_type)rh   r(   rn   r&   r   rF   ro   )r+   rW   r,   rm   r$   r$   r/   	_tokenize7  s    zSiglipTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r(   Zpiece_to_id)r+   tokenr$   r$   r/   _convert_token_to_idL  s   z$SiglipTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r(   Z	IdToPiece)r+   indexrq   r$   r$   r/   _convert_id_to_tokenQ  s   z$SiglipTokenizer._convert_id_to_tokenc                 C   sp   g }d}d}|D ]#}|| j v r$|s|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.rX   Frb   T)rl   r(   decodeappendrg   )r+   rm   Zcurrent_sub_tokensZ
out_stringZprev_is_specialrq   r$   r$   r/   convert_tokens_to_stringV  s   

z(SiglipTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-rX   r   wb)ospathisdirloggererrorrc   VOCAB_FILES_NAMESabspathr   isfiler   r3   r(   Zserialized_model_protowrite)r+   rx   ry   Zout_vocab_filefiZcontent_spiece_modelr$   r$   r/   save_vocabularyj  s"   (

zSiglipTokenizer.save_vocabulary)r   r   r   NNr   T)NFr8   )F)$__name__
__module____qualname____doc__r   Zvocab_files_namesZmodel_input_namesr   r   r&   r   r*   r'   propertyr:   r@   r   intboolrE   rL   rM   rN   rR   rV   r]   rh   ri   ro   rp   rr   rt   rw   r   r   __classcell__r$   r$   r-   r/   r   -   sv    (1








(r   )#r   r|   re   r[   rJ   shutilr   typingr   r   r   r   r   r   r   r1   Zconvert_slow_tokenizerr
   Ztokenization_utilsr   Ztokenization_utils_baser   r   utilsr   r   Zutils.import_utilsr   Z
get_loggerr   r   r   rj   r   __all__r$   r$   r$   r/   <module>   s.    
  
P