
from typing import Dict, Iterator, List, Optional, Tuple, Union

from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer


class CharBPETokenizer(BaseTokenizer):
    """Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich
    (https://arxiv.org/abs/1508.07909)

    The default settings correspond to the OpenAI GPT BPE tokenizer and differ from the
    original Sennrich subword-nmt implementation by the following options, which you can
    deactivate:
        - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
            * removing any control characters and replacing all whitespaces by the classic one.
            * handling Chinese chars by putting spaces around them.
            * stripping all accents.
        - splitting on punctuation in addition to whitespaces (deactivate it with
          `split_on_whitespace_only=True`)
    """

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        suffix: str = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
        bert_normalizer: bool = True,
        split_on_whitespace_only: bool = False,
    ):
        # Build the BPE model from an existing vocab/merges pair, or empty for later training
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(
                    vocab,
                    merges,
                    dropout=dropout,
                    unk_token=str(unk_token),
                    end_of_word_suffix=suffix,
                )
            )
        else:
            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if bert_normalizer:
            normalizers += [BertNormalizer(lowercase=False)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure (single normalizer or a Sequence of them)
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        if split_on_whitespace_only:
            tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
        else:
            tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
            "lowercase": lowercase,
            "unicode_normalizer": unicode_normalizer,
            "bert_normalizer": bert_normalizer,
            "split_on_whitespace_only": split_on_whitespace_only,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab_filename: str, merges_filename: str, **kwargs):
        vocab, merges = BPE.read_file(vocab_filename, merges_filename)
        return CharBPETokenizer(vocab, merges, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
    ):
        """Train the model using the given files"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        special_tokens: List[Union[str, AddedToken]] = ["<unk>"],
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        suffix: Optional[str] = "</w>",
        show_progress: bool = True,
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            end_of_word_suffix=suffix,
            show_progress=show_progress,
        )
        self._tokenizer.train_from_iterator(iterator, trainer=trainer, length=length)
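

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the library): trains a tiny
# CharBPETokenizer on an in-memory corpus and round-trips a sentence through
# encode/decode. The corpus, vocab_size and other hyperparameters below are
# made-up values chosen purely for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    corpus = [
        "low lower lowest",
        "new newer newest",
        "wide wider widest",
    ]

    tokenizer = CharBPETokenizer()  # empty BPE model, trained just below
    tokenizer.train_from_iterator(
        corpus,
        vocab_size=200,
        min_frequency=1,
        special_tokens=["<unk>"],
    )

    encoding = tokenizer.encode("lowest newest")
    print(encoding.tokens)                  # subword tokens; word-final pieces carry the "</w>" suffix
    print(tokenizer.decode(encoding.ids))   # text reconstructed by the BPEDecoder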