
    eTh                         S SK JrJrJrJrJrJr  S SKJrJ	r	J
r
JrJrJr  S SKJr  S SKJrJrJr  SSKJr   " S S\5      rg	)
    )DictIteratorListOptionalTupleUnion)
AddedToken	Tokenizerdecoderspre_tokenizers
processorstrainers)BPE)	LowercaseSequenceunicode_normalizer_from_str   )BaseTokenizerc                     ^  \ rS rSrSr         SS\\\\\\	4   4      S\\\\\
\	\	4   \
\	\	4   4   4      S\S\S\\   S	\\   S
\\   S\\   S\4U 4S jjjr\S\S\4S j5       rSSS/ 4S\\\\   4   S\	S\	S\S\\\\4      4
S jjrSSS/ S4S\\\   \\\      4   S\	S\	S\S\\\\4      S\\	   4S jjrSrU =r$ )ByteLevelBPETokenizer
   zbByteLevelBPETokenizer

Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
Nvocabmergesadd_prefix_space	lowercasedropoutunicode_normalizercontinuing_subword_prefixend_of_word_suffixtrim_offsetsc
           
      "  > Ub,  Ub)  [        [        UUUU=(       d    SU=(       d    SS95      n
O[        [        5       5      n
/ nU(       a  U[        U5      /-  nU(       a  U[        5       /-  n[	        U5      S:  a*  [	        U5      S:  a  [        U5      U
l        O
US   U
l        [        R                  " US9U
l	        [        R                  " 5       U
l        [        R                  " U	S9U
l        SUUUUUUU	S.n[        TU ]=  X5        g )	N )r   r   r   r   r   )r   )r    ByteLevelBPE)modelr   r   r   r   r   r   r    )r
   r   r   r   lenr   
normalizerr   	ByteLevelpre_tokenizerr   decoderr   post_processorsuper__init__)selfr   r   r   r   r   r   r   r   r    	tokenizernormalizers
parameters	__class__s                a/var/www/auris/envauris/lib/python3.13/site-packages/tokenizers/implementations/byte_level_bpe.pyr,   ByteLevelBPETokenizer.__init__   s    !3!#.G.M2'9'?RI "#%(I 78JKLLKIK=(K {a;!#'/'<	$'21~	$"0":":L\"]	$..0	#-#7#7\#R	  $ 0""4)B"4(	

 	/    vocab_filenamemerges_filenamec                 J    [         R                  " X5      u  p4[        X440 UD6$ )N)r   	read_filer   )r5   r6   kwargsr   r   s        r2   	from_fileByteLevelBPETokenizer.from_fileJ   s"    nF$U=f==r4   i0u     Tfiles
vocab_sizemin_frequencyshow_progressspecial_tokensc                     [         R                  " UUUU[        R                  R	                  5       S9n[        U[        5      (       a  U/nU R                  R                  XS9  g)z%Train the model using the given filesr>   r?   r@   rA   initial_alphabet)trainerN)	r   
BpeTrainerr   r'   alphabet
isinstancestr
_tokenizertrain)r-   r=   r>   r?   r@   rA   rE   s          r2   rK   ByteLevelBPETokenizer.trainO   s]     %%!'')+55>>@
 eS!!GEe5r4   iteratorlengthc                     [         R                  " UUUU[        R                  R	                  5       S9nU R
                  R                  UUUS9  g)z(Train the model using the given iteratorrC   )rE   rN   N)r   rF   r   r'   rG   rJ   train_from_iterator)r-   rM   r>   r?   r@   rA   rN   rE   s           r2   rP   )ByteLevelBPETokenizer.train_from_iteratord   sT     %%!'')+55>>@
 	++ 	, 	
r4    )	NNFFNNNNF)__name__
__module____qualname____firstlineno____doc__r   r   rI   r   intr   boolfloatr,   staticmethodr:   r   r	   rK   r   rP   __static_attributes____classcell__)r1   s   @r2   r   r   
   s    7;OS!&#',037,0"80c4S>12380 sDsCx%S/)I$JJKL80 	80
 80 %80 %SM80 $,C=80 %SM80 80 80t ># > > >  "796S$s)^$6 6 	6
 6 U3
?3460  "79 $
x'>>?
 
 	

 
 U3
?34
 
 
r4   r   N)typingr   r   r   r   r   r   
tokenizersr	   r
   r   r   r   r   tokenizers.modelsr   tokenizers.normalizersr   r   r   base_tokenizerr   r   rR   r4   r2   <module>rc      s+    ? ? \ \ ! S S )p
M p
r4   