import os
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class GemmaTokenizer(PreTrainedTokenizer):
    """
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
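
    Example (an illustrative sketch, not part of the original docstring; the local path
    `./tokenizer.model` and the sampling settings are hypothetical):

    ```python
    >>> from transformers import GemmaTokenizer

    >>> # Build from a local SentencePiece model file (hypothetical path).
    >>> tokenizer = GemmaTokenizer("./tokenizer.model")
    >>> ids = tokenizer("Hello world")["input_ids"]

    >>> # Subword regularization can be enabled by forwarding kwargs to SentencePiece.
    >>> sampling_tokenizer = GemmaTokenizer(
    ...     "./tokenizer.model",
    ...     sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    ... )
    ```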
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        clean_up_tokenization_spaces=False,
        use_default_system_prompt=False,
        spaces_between_special_tokens=False,
        **kwargs,
    ):
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
        pad_token = AddedToken(pad_token, normalized=False, special=True) if isinstance(pad_token, str) else pad_token
        self.vocab_file = vocab_file
        self.add_bos_token = add_bos_token
        self.add_eos_token = add_eos_token
        self.use_default_system_prompt = use_default_system_prompt

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            sp_model_kwargs=sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            use_default_system_prompt=use_default_system_prompt,
            spaces_between_special_tokens=spaces_between_special_tokens,
            **kwargs,
        )

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        """Returns vocab as a dict"""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def tokenize(self, text: "TextInput", **kwargs) -> List[str]:
        """
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        """
        return super().tokenize(text, **kwargs)

    def _tokenize(self, text, **kwargs):
        """
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )Zout_type)r'   encoder$   rC   r"   r"   r/   	_tokenize   s   zGemmaTokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r'   Zpiece_to_id)r+   tokenr"   r"   r/   _convert_token_to_id   s   z#GemmaTokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r'   Z	IdToPiece)r+   indexrF   r"   r"   r/   _convert_id_to_token   s   z#GemmaTokenizer._convert_id_to_tokenc                 C   sT   g }d}|D ]}|| j v r|| j|| 7 }g }q|| q|| j|7 }|S )z:Converts a sequence of tokens (string) in a single string. )Z_added_tokens_encoderr'   decodeappend)r+   tokensZcurrent_sub_tokensZ
out_stringrF   r"   r"   r/   convert_tokens_to_string   s   
z'GemmaTokenizer.convert_tokens_to_stringfilename_prefixc                 C   s   t j|std| d dS t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d   |fS 1 sbw   Y  |fS )a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
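
        Example (an illustrative sketch, not part of the original docstring; the directory
        name is hypothetical and must already exist):

        ```python
        >>> tokenizer.save_vocabulary("./saved", filename_prefix="gemma")
        ('./saved/gemma-tokenizer.model',)
        ```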
        zVocabulary path (z) should be a directoryN-rJ   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr'   r3   write)r+   Zsave_directoryrO   Zout_vocab_filefiZcontent_spiece_modelr"   r"   r/   save_vocabulary   s"   (

zGemmaTokenizer.save_vocabularyc                 C   sL   | j r| jgng }| jr| jgng }|| | }|d ur$|| | | }|S N)r   bos_token_idr   eos_token_idr+   token_ids_0token_ids_1r`   ra   outputr"   r"   r/    build_inputs_with_special_tokens   s   z/GemmaTokenizer.build_inputs_with_special_tokensrc   rd   already_has_special_tokensc                    s   |rt  j||ddS | jrdgng }| jrdgng }|du r*|dgt|  | S |dgt|  | | dgt|  | S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
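
        Example (an illustrative sketch, not part of the original docstring; assumes a
        tokenizer built with the defaults `add_bos_token=True` and `add_eos_token=False`):

        ```python
        >>> tokenizer.get_special_tokens_mask([10, 11, 12])
        [1, 0, 0, 0]
        ```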
        T)rc   rd   rg      Nr   )r)   get_special_tokens_maskr   r   len)r+   rc   rd   rg   r`   ra   r-   r"   r/   ri      s(   z&GemmaTokenizer.get_special_tokens_maskc                 C   s`   | j r| jgng }| jr| jgng }dgt|| |  }|dur.|dgt|| |  7 }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. The sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
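
        Example (an illustrative sketch, not part of the original docstring; assumes the
        defaults `add_bos_token=True` and `add_eos_token=False`):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11, 12], [20, 21, 22])
        [0, 0, 0, 0, 1, 1, 1, 1]
        ```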
        r   Nrh   )r   r`   r   ra   rj   rb   r"   r"   r/   $create_token_type_ids_from_sequences  s   z3GemmaTokenizer.create_token_type_ids_from_sequences	token_idsskip_special_tokensr!   c                 K   s   g }g }|D ]+}|r|| j v rq|| jv r,|r || j| || j| j g }q|| q|r=|| j| |rEd|}nd|}|tdS )N rJ   )	Zall_special_idsZ_added_tokens_decoderrL   r'   rK   contentrW   replaceSPIECE_UNDERLINE)r+   rl   rm   r!   r,   Z	sub_textsZcurrent_sub_textZidsr"   r"   r/   _decode1  s"   

zGemmaTokenizer._decode)
r   r   r   r   NTFFFFr_   )NF)FF) __name__
__module____qualname____doc__rX   Zvocab_files_namesZmodel_input_namesr   r   r$   r   r*   r5   r8   propertyr:   r?   r   rB   rE   rG   rI   rN   r   r^   rf   intboolri   rk   rr   __classcell__r"   r"   r-   r/   r   +   st    .*


&

$r   )rR   shutilr   typingr   r   r   r   r   r   r   r%   Ztokenization_utilsr
   r   utilsr   Zutils.import_utilsr   Ztokenization_utils_baser   Z
get_loggerrs   rU   rX   rq   r   __all__r"   r"   r"   r/   <module>   s"    
  
%