import os
from shutil import copyfile
from typing import Optional, Tuple

from tokenizers import processors

from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_gemma import GemmaTokenizer
else:
    GemmaTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"}


class GemmaTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" Gemma tokenizer. Based on byte-level Byte-Pair-Encoding.

    This tokenizer notably uses ByteFallback and no prefix space. Normalization is applied to replace `" "` with `"▁"`.

    ```python
    >>> from transformers import GemmaTokenizerFast

    >>> tokenizer = GemmaTokenizerFast.from_pretrained("hf-internal-testing/dummy-gemma")
    >>> tokenizer.encode("Hello this is a test")
    [2, 4521, 736, 603, 476, 2121]
    ```
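
    Because spaces are normalized to `"▁"`, each non-initial word piece carries its boundary marker explicitly. A
    minimal sketch of what this looks like (the exact pieces depend on the checkpoint's vocabulary):

    ```python
    >>> tokenizer.tokenize("Hello this is a test")  # doctest: +SKIP
    ['Hello', '▁this', '▁is', '▁a', '▁test']
    ```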

    If you want to change the `bos_token` or the `eos_token`, make sure to specify them when initializing the model, or
    call `tokenizer.update_post_processor()` to make sure that the post-processing is correctly done (otherwise the
    values of the first token and final token of an encoded sequence will not be correct). For more details, check out
    the [post-processors](https://huggingface.co/docs/tokenizers/api/post-processors) documentation.
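
    For instance, flipping `add_eos_token` goes through a property setter that re-runs `update_post_processor()`, so
    later encodes pick up the new template. A minimal sketch (behavior shown, not checkpoint-specific ids):

    ```python
    >>> tokenizer.add_eos_token = True  # the setter re-templates the post-processor
    >>> ids = tokenizer.encode("Hello this is a test")
    >>> ids[-1] == tokenizer.eos_token_id
    True
    ```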


    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`, *optional*):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (`str`, *optional*):
            [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
            contains everything needed to load the tokenizer.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
            extra spaces.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The padding token.
        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add a `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
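
    As a further sketch of how these two flags interact with `build_inputs_with_special_tokens` (assuming both
    `add_bos_token=True` and `add_eos_token=True`; ids `2`/`1` are assumed to be the `<bos>`/`<eos>` ids of the
    loaded checkpoint):

    ```python
    >>> tokenizer.build_inputs_with_special_tokens([10, 11])  # doctest: +SKIP
    [2, 10, 11, 1]
    >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12])  # doctest: +SKIP
    [2, 10, 11, 1, 2, 12, 1]
    ```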
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = GemmaTokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        clean_up_tokenization_spaces=False,
        unk_token="<unk>",
        bos_token="<bos>",
        eos_token="<eos>",
        pad_token="<pad>",
        add_bos_token=True,
        add_eos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            tokenizer_file=tokenizer_file,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            **kwargs,
        )
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # The slow (SentencePiece) tokenizer can only be saved if the original
        # `tokenizer.model` file is available on disk.
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # Build TemplateProcessing patterns such as "<bos>:0 $A:0 <eos>:0" for a
        # single sequence, and the matching ":1"-suffixed template for pairs.
        single = f"{(bos + ':0 ') if self.add_bos_token else ''}$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' ' + bos + ':1') if self.add_bos_token else ''} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        return self._add_eos_token

    @property
    def add_bos_token(self):
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Only copy the SentencePiece model if it is not already at the target path.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
        eos_token_id = [self.eos_token_id] if self.add_eos_token else []

        output = bos_token_id + token_ids_0 + eos_token_id

        if token_ids_1 is not None:
            output = output + bos_token_id + token_ids_1 + eos_token_id

        return output


__all__ = ["GemmaTokenizerFast"]