from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
            )
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)