
from typing import Dict, Iterator, List, Optional, Tuple, Union

from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer


class SentencePieceBPETokenizer(BaseTokenizer):
    """SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """
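    # A minimal usage sketch (illustrative only; the corpus file name and the
    # resulting tokens are hypothetical and depend entirely on the trained
    # vocabulary):
    #
    #     tokenizer = SentencePieceBPETokenizer()
    #     tokenizer.train(["corpus.txt"], vocab_size=8000)
    #     print(tokenizer.encode("Hello world").tokens)
    #     # e.g. ['▁Hello', '▁world']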
N<unk>Tvocabmerges	unk_tokenreplacementadd_prefix_spacedropoutfuse_unkc           
        > Ub  Ub  [        [        XXcUS95      nO[        [        XcUS95      nUR                  [        U5      5      b  UR	                  [        U5      /5        [        5       Ul        U(       a  SOSn	[        R                  " XIS9Ul	        [        R                  " XIS9Ul        SUUUUS.n
[        TU ]5  X5        g )N)r   r   r   alwaysnever)r   prepend_schemeSentencePieceBPE)modelr   r   r   r   )r
   r   token_to_idstradd_special_tokensr   
normalizerr   	Metaspacepre_tokenizerr   decodersuper__init__)selfr   r   r   r   r   r   r   	tokenizerr    
parameters	__class__s              d/var/www/auris/envauris/lib/python3.13/site-packages/tokenizers/implementations/sentencepiece_bpe.pyr+   "SentencePieceBPETokenizer.__init__   s     !3!#eWdl"mnI!#gU]"^_I  Y0<((#i.)9:#v	%57"0":":{"r	$..;f	 ("& 0

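    # Sketch of the Metaspace convention configured above (illustrative; the
    # exact splits and offsets come from the tokenizers library):
    #
    #     pre = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="always")
    #     pre.pre_tokenize_str("Hello world")
    #     # → pieces like "▁Hello" and "▁world", each word carrying the "▁"
    #     #   marker that stands in for the preceding space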
 	/    vocab_filenamemerges_filenamec                 J    [         R                  " X5      u  p4[        X440 UD6$ )N)r   	read_filer   )r3   r4   kwargsr   r   s        r0   	from_file#SentencePieceBPETokenizer.from_file1   s"    nF(A&AAr2   i0u     i  files
    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        # Accept a single path as well as a list of paths
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)
    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            show_progress=show_progress,
        )
        # `length` lets the progress bar report a total for a lazy iterator
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
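    # Sketch of streaming training without writing files to disk (the in-memory
    # corpus here is a stand-in for any iterator of strings):
    #
    #     corpus = ["first example sentence", "second example sentence"]
    #     tokenizer = SentencePieceBPETokenizer()
    #     tokenizer.train_from_iterator(corpus, vocab_size=5000, length=len(corpus))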