from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the original
    Sennrich subword-nmt implementation by the following options, which you can deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces by the classic one.
            * handling Chinese chars by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        # Build the underlying BPE model from vocab/merges when both are given, otherwise start empty
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(vocab, merges, dropout=dropout, unk_token=str(unk_token), end_of_word_suffix=suffix)
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        # Register the unknown token as a special token if it exists in the vocabulary
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []
        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]
        if lowercase:
            normalizers += [Lowercase()]

        # Combine the selected normalizers into a single normalizer
        if len(normalizers) > 0:
            tokenizer.normalizer = Sequence(normalizers) if len(normalizers) > 1 else normalizers[0]

        # Split on whitespace only, or on whitespace and punctuation (BERT-style)
        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix, show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size, min_frequency=min_frequency, special_tokens=special_tokens,
            limit_alphabet=limit_alphabet, initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix, show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
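

# Minimal usage sketch (not executed on import): trains a CharBPETokenizer from scratch,
# encodes a sentence, and saves the result. The corpus path "corpus.txt", the vocab size,
# and the output filename below are placeholders chosen for illustration, not values
# mandated by the library.
if __name__ == "__main__":
    tokenizer = CharBPETokenizer()

    # Learn a small merge table and vocabulary from the placeholder corpus file
    tokenizer.train(["corpus.txt"], vocab_size=5000, min_frequency=2)

    # Encode a sentence; tokens carry the "</w>" end-of-word suffix that the decoder strips
    encoding = tokenizer.encode("Byte pair encoding is simple.")
    print(encoding.tokens)

    # Round-trip back to text and persist the trained tokenizer to a JSON file
    print(tokenizer.decode(encoding.ids))
    tokenizer.save("char_bpe.json")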