o
    Zh 
                     @   sh   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	 e
eZdd	d
dZG dd deZdgZdS )z)Fast Tokenization classes for OpenAI GPT.    )OptionalTuple   )PreTrainedTokenizerFast)logging   )OpenAIGPTTokenizerz
vocab.jsonz
merges.txtztokenizer.json)
vocab_filemerges_filetokenizer_filec                       s^   e Zd ZdZeZddgZeZd fdd	Z	e
dd	 Zdd
edee dee fddZ  ZS )OpenAIGPTTokenizerFasta  
    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
    the following peculiarities:

    - lower case all inputs
    - uses BERT's BasicTokenizer for pre-BPE tokenization

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    Z	input_idsZattention_maskN<unk>c                    s    t  j||f||d| d S )N)r   	unk_token)super__init__)selfr	   r
   r   r   kwargs	__class__ b/var/www/auris/lib/python3.10/site-packages/transformers/models/openai/tokenization_openai_fast.pyr   6   s    zOpenAIGPTTokenizerFast.__init__c                 C   s   dS )NTr   )r   r   r   r   do_lower_case9   s   z$OpenAIGPTTokenizerFast.do_lower_casesave_directoryfilename_prefixreturnc                 C   s   | j jj||d}t|S )N)name)Z
_tokenizermodelsavetuple)r   r   r   filesr   r   r   save_vocabulary=   s   z&OpenAIGPTTokenizerFast.save_vocabulary)NNNr   )N)__name__
__module____qualname____doc__VOCAB_FILES_NAMESZvocab_files_namesZmodel_input_namesr   Zslow_tokenizer_classr   propertyr   strr   r   r    __classcell__r   r   r   r   r      s    
(r   N)r$   typingr   r   Ztokenization_utils_fastr   utilsr   Ztokenization_openair   Z
get_loggerr!   loggerr%   r   __all__r   r   r   r   <module>   s   

%