"""Tokenization classes for FNet model."""

import os
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils import AddedToken
from ...tokenization_utils_fast import PreTrainedTokenizerFast
from ...utils import is_sentencepiece_available, logging


if is_sentencepiece_available():
    from .tokenization_fnet import FNetTokenizer
else:
    FNetTokenizer = None

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}

SPIECE_UNDERLINE = "▁"


class FNetTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
    [`AlbertTokenizerFast`]. Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (`bool`, *optional*, defaults to `True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (`bool`, *optional*, defaults to `True`):
            Whether or not to keep accents when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
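
    Example (a minimal usage sketch; the `"google/fnet-base"` checkpoint name is an assumption used
    for illustration only):

    ```python
    >>> from transformers import FNetTokenizerFast

    >>> tokenizer = FNetTokenizerFast.from_pretrained("google/fnet-base")
    >>> encoded = tokenizer("Hello world")  # yields "input_ids" and "token_type_ids"
    ```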
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "token_type_ids"]
    slow_tokenizer_class = FNetTokenizer

    def __init__(
        self,
        vocab_file=None,
        tokenizer_file=None,
        do_lower_case=False,
        remove_space=True,
        keep_accents=True,
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs,
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it,
        # so it can match the raw text even in a non-normalized sentence.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        self.vocab_file = vocab_file
zFNetTokenizerFast.__init__returnc                 C   s   | j r
tj| j S dS )NF)r   ospathisfile)r#   r'   r'   r(   can_save_slow_tokenizert   s   z)FNetTokenizerFast.can_save_slow_tokenizertoken_ids_0token_ids_1c                 C   s8   | j g}| jg}|du r|| | S || | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An FNet sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
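
        Example (a minimal sketch; `tokenizer` is assumed to be an already instantiated
        `FNetTokenizerFast`, and `5`/`6` are placeholder vocabulary IDs):

        ```python
        >>> ids = tokenizer.build_inputs_with_special_tokens([5, 6])
        >>> ids == [tokenizer.cls_token_id, 5, 6, tokenizer.sep_token_id]
        True
        ```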
        N)sep_token_idcls_token_idr#   r.   r/   sepclsr'   r'   r(    build_inputs_with_special_tokensx   s
   z2FNetTokenizerFast.build_inputs_with_special_tokensc                 C   sV   | j g}| jg}|du rt|| | dg S t|| | dg t|| dg  S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
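
        Example (a minimal sketch; `tokenizer` is assumed to be an already instantiated
        `FNetTokenizerFast`, with placeholder vocabulary IDs):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([5, 6], [7, 8, 9])
        [0, 0, 0, 0, 1, 1, 1, 1]
        ```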
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)


__all__ = ["FNetTokenizerFast"]