o
    Zh'                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZmZ e r6dd	lmZ ndZeeZd
ddZG dd deZdgZdS )z Tokenization class for model T5.    N)copyfile)ListOptionalTuple   )PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )T5Tokenizerzspiece.modelztokenizer.json)
vocab_filetokenizer_filec                       s   e Zd ZU dZeZddgZeZg Z	e
e ed< 									d fd
d	ZedefddZedd Zd dedee dee fddZ	d de
e dee
e  de
e fddZ	d de
e dee
e  de
e fddZdd Zdd Z  ZS )!T5TokenizerFasta`  
    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Add a number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. These tokens can be retrieved by
            calling get_sentinel_tokens method and token ids can be by calling get_sentinel_token_ids method
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        add_prefix_space (`bool`, *optional*):
            Whether or not the tokenizer should automatically add a prefix space
        from_slow (`book`, *optional*, defaults to `False`):
            Whether or not the tokenizer should be converted from a slow one. If `add_prefix_space` is set, this will be set to `True`.
    Z	input_idsZattention_maskprefix_tokensN</s><unk><pad>d   c	                    s   |d ur3dd |D }
t |
dk r|dd t|D 7 }n!|dkr2|t |
kr2td| d| dnd	d t|D }
|
}|d urKtd
 d|	d< t jd||||||||d|	 || _|| _d S )Nc                 S   s   g | ]
}d t |v r|qS )
<extra_id_)str).0x r   Z/var/www/auris/lib/python3.10/site-packages/transformers/models/t5/tokenization_t5_fast.py
<listcomp>d   s    z,T5TokenizerFast.__init__.<locals>.<listcomp>r
   c                 S      g | ]}d | dqS r   >r   r   ir   r   r   r   f       r   zBoth extra_ids (z!) and additional_special_tokens (zk) are provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensc                 S   r   r   r   r   r   r   r   r   n   r    zXYou set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizersTZ	from_slow)r   r   	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensadd_prefix_spacer   )	lenrange
ValueErrorloggerZwarning_oncesuper__init__r   Z
_extra_ids)selfr   r   r!   r"   r#   r$   r%   r&   kwargsZextra_tokens	__class__r   r   r,   V   s<   	
zT5TokenizerFast.__init__returnc                 C   s   | j r
tj| j S dS )NF)r   ospathisfiler-   r   r   r   can_save_slow_tokenizer   s   z'T5TokenizerFast.can_save_slow_tokenizerc                 C   sZ   | t jv r+t j|  }|d ur||kr|S |d u r+td| d|  d| d| d	t |S )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on z( automatically truncating your input to zM when padding/encoding.
- If you want to encode/pad to sequences longer than z you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
- To avoid this warning, please instantiate this tokenizer with `model_max_length` set to your preferred value.)r   Zmax_model_input_sizeswarningswarnFutureWarning)Zpretrained_model_name_or_pathZmax_model_lengthZinit_max_model_lengthZdeprecated_max_model_lengthr   r   r   !_eventually_correct_t5_max_length   s$   

	z1T5TokenizerFast._eventually_correct_t5_max_lengthsave_directoryfilename_prefixc                 C   s   | j stdtj|std| d d S tj||r"|d ndtd  }tj	| j
tj	|krDt| j
| td|  |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory- r   zCopy vocab file to )r6   r)   r2   r3   isdirr*   errorjoinVOCAB_FILES_NAMESabspathr   r   info)r-   r;   r<   Zout_vocab_filer   r   r   save_vocabulary   s   zT5TokenizerFast.save_vocabularytoken_ids_0token_ids_1c                 C   s8   || j g }|du r| j| S || j g }| j| | S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)eos_token_idr   )r-   rF   rG   r   r   r    build_inputs_with_special_tokens   s
   
z0T5TokenizerFast.build_inputs_with_special_tokensc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )rH   r'   )r-   rF   rG   Zeosr   r   r   $create_token_type_ids_from_sequences   s   z4T5TokenizerFast.create_token_type_ids_from_sequencesc                 C   s   t ttdd | jS )Nc                 S   s   t td| d uS )Nz<extra_id_\d+>)boolresearch)r   r   r   r   <lambda>   s    z5T5TokenizerFast.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr%   r5   r   r   r   get_sentinel_tokens   s   z#T5TokenizerFast.get_sentinel_tokensc                    s    fdd   D S )Nc                    s   g | ]}  |qS r   )Zconvert_tokens_to_ids)r   tokenr5   r   r   r      s    z:T5TokenizerFast.get_sentinel_token_ids.<locals>.<listcomp>)rR   r5   r   r5   r   get_sentinel_token_ids   s   z&T5TokenizerFast.get_sentinel_token_ids)NNr   r   r   r   NN)N)__name__
__module____qualname____doc__rB   Zvocab_files_namesZmodel_input_namesr   Zslow_tokenizer_classr   r   int__annotations__r,   propertyrK   r6   staticmethodr:   r   r   r   rE   rI   rJ   rR   rT   __classcell__r   r   r/   r   r   )   sL   
 &0
 



r   )rX   r2   rL   r7   shutilr   typingr   r   r   Ztokenization_utils_fastr   utilsr   r	   Ztokenization_t5r   Z
get_loggerrU   r*   rB   r   __all__r   r   r   r   <module>   s    

 
E