
from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
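
# ---------------------------------------------------------------------------
# Usage sketch (not part of the upstream library source): a minimal example of
# training and using this tokenizer. The corpus path "corpus.txt" is a
# hypothetical placeholder; because this module uses a relative import, run it
# as a package module (python -m tokenizers.implementations.bert_wordpiece)
# rather than as a standalone script.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Instantiate an untrained BERT-style WordPiece tokenizer.
    tokenizer = BertWordPieceTokenizer(lowercase=True)

    # Train a vocabulary from raw text files; train_from_iterator() accepts an
    # iterator of strings instead, which avoids writing the corpus to disk.
    tokenizer.train(files=["corpus.txt"], vocab_size=30000)

    # encode() is inherited from BaseTokenizer; the result exposes the
    # produced subword tokens and their vocabulary ids.
    output = tokenizer.encode("Hello, y'all! How are you?")
    print(output.tokens)
    print(output.ids)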