from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        fuse_unk: Optional[bool] = False,
    ):
        # Build the BPE model from an existing vocab/merges pair when both are
        # given; otherwise start from an empty model to be trained later.
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
        else:
            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # SentencePiece-style pipeline: NFKC normalization, then Metaspace
        # pre-tokenization and decoding with the replacement character.
        tokenizer.normalizer = NFKC()
        prepend_scheme = "always" if add_prefix_space else "never"
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        tokenizer.decoder = decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return SentencePieceBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
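
# A minimal usage sketch of this class. Assumptions: a plain-text training
# file exists at the hypothetical path "corpus.txt", and the tokens produced
# depend entirely on that corpus; vocab_size=5000 is an arbitrary example.
#
#     from tokenizers.implementations import SentencePieceBPETokenizer
#
#     tokenizer = SentencePieceBPETokenizer()
#     tokenizer.train(["corpus.txt"], vocab_size=5000, special_tokens=["<unk>"])
#     encoding = tokenizer.encode("Hello world")
#     print(encoding.tokens)  # word-initial tokens carry the "▁" prefix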