from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str

from .base_tokenizer import BaseTokenizer


class ByteLevelBPETokenizer(BaseTokenizer):
    """ByteLevelBPETokenizer

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        # Build the BPE model, either from an existing vocab/merges pair or empty.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                )
            )
        else:
            tokenizer = Tokenizer(BPE())

        # Check for Unicode normalization first (before everything else).
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure.
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return ByteLevelBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """Train the model using the given files"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=show_progress,
            special_tokens=special_tokens,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
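

# --- Usage sketch (not part of the library source) ---
# A minimal example of how this class is typically used: train a byte-level BPE
# from an in-memory corpus, then encode a string. The corpus strings, vocab_size,
# and min_frequency below are illustrative placeholders, not values prescribed by
# the library.
if __name__ == "__main__":
    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)

    # Training from an iterator keeps the sketch self-contained (no files on disk).
    corpus = ["Hello, world!", "Byte-level BPE handles any UTF-8 text."]
    tokenizer.train_from_iterator(corpus, vocab_size=500, min_frequency=1)

    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)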