o
    Zh                     @   s   d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZmZ e r.dd	lmZ ndZeeZd
ddZG dd de	ZdgZdS )zTokenization classes for XGLM.    N)copyfile)ListOptionalTuple   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )XGLMTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_filec                       s   e Zd ZdZeZddgZeZ								d fd	d
	Z	e
defddZ	ddee deee  dee fddZ	ddee deee  dee fddZddedee dee fddZ  ZS )XGLMTokenizerFasta{	  
    Construct a "fast" XGLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from [`RobertaTokenizer`]
    and [`XLNetTokenizer`]. Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
            Additional special tokens used by the tokenizer.
    Z	input_idsZattention_maskN<s></s><unk><pad>c	              
      sz   d| _ dd t| j D }
 dg pg  d<  d   fdd|
D 7  < t j|f|||||||d  || _d S )N   c                 S   s   g | ]}d | dqS )z<madeupword> ).0ir   r   ^/var/www/auris/lib/python3.10/site-packages/transformers/models/xglm/tokenization_xglm_fast.py
<listcomp>f   s    z.XGLMTokenizerFast.__init__.<locals>.<listcomp>additional_special_tokensc                    s   g | ]
}| d  vr|qS )r   r   )r   wordkwargsr   r   r   i   s    )r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token)Znum_madeup_wordsrangegetsuper__init__r   )selfr   r   r   r   r    r!   r"   r#   r   Zmadeup_words	__class__r   r   r'   X   s(   
	
zXGLMTokenizerFast.__init__returnc                 C   s   | j r
tj| j S dS )NF)r   ospathisfile)r(   r   r   r   can_save_slow_tokenizer{   s   z)XGLMTokenizerFast.can_save_slow_tokenizertoken_ids_0token_ids_1c                 C   s0   |du r
| j g| S | j g}|| | | | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM-RoBERTa sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)sep_token_idr(   r0   r1   sepr   r   r    build_inputs_with_special_tokens   s   z2XGLMTokenizerFast.build_inputs_with_special_tokensc                 C   s@   | j g}|du rt|| dg S t|| | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        Nr   )r2   lenr3   r   r   r   $create_token_type_ids_from_sequences   s   z6XGLMTokenizerFast.create_token_type_ids_from_sequencessave_directoryfilename_prefixc                 C   s~   | j stdtj|std| d d S tj||r"|d ndtd  }tj	| j
tj	|kr<t| j
| |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )r/   
ValueErrorr,   r-   isdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r(   r8   r9   Zout_vocab_filer   r   r   save_vocabulary   s   z!XGLMTokenizerFast.save_vocabulary)NNr   r   r   r   r   r   )N)__name__
__module____qualname____doc__rA   Zvocab_files_namesZmodel_input_namesr   Zslow_tokenizer_classr'   propertyboolr/   r   intr   r5   r7   strr   rC   __classcell__r   r   r)   r   r   $   sB    /#



(r   )rG   r,   shutilr   typingr   r   r   Ztokenization_utils_fastr   utilsr   r	   Ztokenization_xglmr   Z
get_loggerrD   r>   rA   r   __all__r   r   r   r   <module>   s   

 
!