
    eTh                        S r SSKrSSKJr  SSKJr  SSKJrJrJ	r	J
r
JrJrJr  SSKJrJrJr  SSKJrJrJrJr  SS	KJr  \R2                  " \5      rSS
 jrS\S\4S jrS r  " S S5      r! " S S\!5      r"S\S\4S jr# " S S5      r$ " S S\$5      r% " S S\$5      r& " S S\$5      r' " S S\$5      r( " S S \$5      r) " S! S"\$5      r* " S# S$\$5      r+ " S% S&\$5      r, " S' S(\$5      r- " S) S*\$5      r. " S+ S,\$5      r/ " S- S.\$5      r0 " S/ S0\05      r1 " S1 S2\05      r2 " S3 S4\05      r3 " S5 S6\05      r4 " S7 S8\05      r5 " S9 S:\05      r6 " S; S<\05      r7 " S= S>\05      r8 " S? S@\05      r9 " SA SB\05      r: " SC SD\05      r; " SE SF\05      r< " SG SH\05      r= " SI SJ\05      r> " SK SL\05      r? " SM SN\05      r@ " SO SP\$5      rA " SQ SR\05      rB " SS ST\$5      rC " SU SV\$5      rD " SW SX\$5      rE " SY SZ\05      rF " S[ S\\05      rG " S] S^\05      rH " S_ S`\$5      rI " Sa Sb\05      rJ " Sc Sd\05      rKSe rL " Sf Sg5      rM0 Sh\1_Si\-_Sj\2_Sk\%_Sl\B_Sm\E_Sn\3_So\C_Sp\*_Sq\%_Sr\/_Ss\4_St\%_Su\%_Sv\%_Sw\%_Sx\%_0 Sy\1_Sz\'_S{\*_S|\+_S}\%_S~\%_S\-_S\9_S\-_S\-_S\%_S\I_S\5_S\6_S\(_S\%_S\-_E0 S\7_S\)_S\>_S\,_S\%_S\;_S\<_S\%_S\-_S\._S\8_S\%_S\?_S\@_S\A_S\9_S\:_E\&\F\H\H\G\HS.ErNSS\	4S jjrOg)z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERRORc                 8   [        5       (       a  SSKJn  U$ [        5       (       aV  SS Kn[
        R                  " UR                  R                  5      [
        R                  " S5      :  a  SSK	Jn  U$ SSK	J
n  U$ [        [        R                  " U 5      5      e)Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   googles      [/var/www/auris/envauris/lib/python3.13/site-packages/transformers/convert_slow_tokenizer.pyimport_protobufr#   #   sr    !##9&&==445g8NNB '& b&&/66}EFF    add_prefix_spacereturnc                 H    U (       a  Sn[        USS5      (       d  SnU$ SnU$ )NalwayslegacyTfirstnever)getattr)r%   original_tokenizerprepend_schemes      r"   _get_prepend_schemer/   4   s4    !)8T::$N  !r$   c                   ^  US LnU(       a  [        U5      OT n/ nUR                  5        Hm  u  pE/ n[        S[        U5      5       H.  nUS U XGS  pUT ;   d  M  U	T ;   d  M  UR	                  XU45        M0     [        UU 4S jS9nUR                  U5        Mo     [        US US9nU V
s/ s H  oS   U
S   4PM     nn
U$ s  sn
f )Nr   c                 $   > TU S      TU S      4$ Nr   r    )xvocabs    r"   <lambda>!generate_merges.<locals>.<lambda>I   s    U1Q4[%!+,Fr$   keyc                 B    U S   [        U S   5      [        U S   5      4$ )N   r   r   )lenvals    r"   r6   r7   L   s    SVSQ[#c!f+,Nr$   r9   reverser   )dictitemsranger<   appendsortedextend)r5   vocab_scoresr@   mergesmergepiece_scorelocalindexpiece_lpiece_rr>   s   `          r"   generate_mergesrO   >   s    $&G)04%eLF*0021c%j)E$Ve}eFmW%Gu$4g<= * u"FGe 3 F NX_`F*01&31vs1v&F1M 2s   -Cc                   R    \ rS rSrSrS\4S jrS	S\\\\	4   \
\   4   4S jjrSrg)
SentencePieceExtractorQ   zd
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
modelc                 v    [        U S5        SSKJn  U" 5       U l        U R                  R	                  U5        g )Nr   r   )SentencePieceProcessor)r   r   rU   spLoad)selfrS   rU   s      r"   __init__SentencePieceExtractor.__init__V   s)    $08(*Ur$   Nr&   c                     U R                   n[        UR                  5       5       Vs0 s H  o2R                  U5      U_M     nn[	        XA5      nXE4$ s  snf )
By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
order the merges with respect to the piece scores instead.
)rV   rC   GetPieceSizeid_to_piecerO   rX   rG   rV   rL   r5   rH   s         r"   extractSentencePieceExtractor.extract]   sS    
 WW;@AR;ST;S%&-;ST 5}	 Us   A)rV   N)__name__
__module____qualname____firstlineno____doc__strrY   tuplerA   intlistr`   __static_attributes__r3   r$   r"   rQ   rQ   Q   s:    c 
E$sCx.$u+2M,N 
 
r$   rQ   c                   @    \ rS rSrSS\\\\4   \\   4   4S jjr	Sr
g)GemmaSentencePieceExtractorj   Nr&   c                     U R                   n[        UR                  5       5       Vs0 s H  o2R                  U5      U_M     nnSU;  a  UR	                  S5      US'   [        XA5      nXE4$ s  snf )r\   	<0x09>)rV   rC   r]   r^   getrO   r_   s         r"   r`   #GemmaSentencePieceExtractor.extractk   so    
 WW;@AR;ST;S%&-;ST u))H-E$K 5} Us   A-r3   rb   )rc   rd   re   rf   ri   rA   rh   rj   rk   r`   rl   r3   r$   r"   rn   rn   j   s)    E$sCx.$u+2M,N  r$   rn   piecec                 z    [        U 5      S:  =(       d'    U S   S:g  =(       d    U S   R                  5       (       + $ )Nr;   ,)r<   isdigit)ru   s    r"   check_number_commar{   {   s3    u:>HU2Y#-HU2Y5F5F5H1HHr$   c                   (    \ rS rSrS rS\4S jrSrg)	Converter   c                     Xl         g rb   r-   )rX   r-   s     r"   rY   Converter.__init__   s    "4r$   r&   c                     [        5       erb   )NotImplementedErrorrX   s    r"   	convertedConverter.converted   s    !##r$   r   N)rc   rd   re   rf   rY   r   r   rl   r3   r$   r"   r}   r}      s    5$9 $r$   r}   c                   "    \ rS rSrS\4S jrSrg)BertConverter   r&   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr-   r5   r   r   rh   r   hasattrr   tokenize_chinese_charsr   do_lower_caser	   BertNormalizer
normalizerr
   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr   TemplateProcessingpost_processorr   decoder
rX   r5   	tokenizerr   r   r   clssepr   r   s
             r"   r   BertConverter.converted      ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r$   r3   Nrc   rd   re   rf   r   r   rl   r3   r$   r"   r   r          #9 #r$   r   c                   "    \ rS rSrS\4S jrSrg)SplinterConverter   r&   c           
      v   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      n[	        U R                   R&                  5      nSn	U R                   R(                  n
U R                   R*                  nU R                   R,                  nU R                   R/                  S5      nU R                   R0                  S:X  a  U SU S	U	 S	U S
U S3
nOU SU S
U S	U	 S	U S3
n[2        R4                  " U SU S3UXj4X{4X4X4/S9Ul        [8        R                  " SS9Ul        U$ )Nr   Fr   Tr   .rightr    r   r   r   r   r   r   )r-   r5   r   r   rh   r   r   r   r   r   r   r	   r   r   r
   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider   r   r   r   r   )rX   r5   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   s                  r"   r   SplinterConverter.converted   s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334t..==>..;;..;; 33EE..DDSI""//7:U(8*AcU!C5RHDU(3%xz3%qRHD#-#@#@U(3%r*##-#		$
	  %..d;	r$   r3   Nr   r3   r$   r"   r   r      s    .9 .r$   r   c                   "    \ rS rSrS\4S jrSrg)FunnelConverter   r&   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   s
             r"   r   FunnelConverter.converted   r   r$   r3   Nr   r3   r$   r"   r   r      r   r$   r   c                   "    \ rS rSrS\4S jrSrg)MPNetConverteri  r&   c                 h   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	U S
3Xh4Xy4/S9Ul        [0        R                  " SS9Ul        U$ )Nr   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   s
             r"   r   MPNetConverter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5SXcU"=##$
	  %..d;	r$   r3   Nr   r3   r$   r"   r   r     r   r$   r   c                   "    \ rS rSrS\4S jrSrg)OpenAIGPTConverteri.  r&   c                    U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS [        U5      SSS95      nUR                  [        U5      5      b  UR                  [        U5      /5        [        R                  " SS9Ul        [        R                  " 5       Ul        ["        R$                  " SS9Ul        U$ )N</w>F)r5   rH   dropoutr   end_of_word_suffixfuse_unkT)r   suffix)r-   encoderrk   	bpe_rankskeysr   r   r   rh   token_to_idadd_special_tokensr	   r   r   r
   r   r   r   
BPEDecoderr   rX   r5   rH   r   r   s        r"   r   OpenAIGPTConverter.converted/  s    ''//d--77<<>?++55	i.#)	
	   Y0<((#i.)9:*99DI	"0"A"A"C	$//v>	r$   r3   Nr   r3   r$   r"   r   r   .  s    9 r$   r   c            	       V    \ rS rSr SS\\\\4      S\\\	\\4         S\
4S jjrSrg)	GPT2ConverteriI  Nr5   rH   r&   c                 Z   U(       d  U R                   R                  nU(       d  [        U R                   R                  5      n[	        [        UUS SSSS95      n[        U R                   SS5      n[        R                  " US9Ul	        [        R                  " 5       Ul        [        U R                   SS5      (       aQ  U R                   R                  nU R                   R                  n[        R                  " U S3U S3XV4/S	9Ul        U$ [        R                  " SS
9Ul        U$ )N Fr5   rH   r   continuing_subword_prefixr   r   r%   r%   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r-   r   rk   r   r   r   r,   r
   	ByteLevelr   r   r   	bos_tokenbos_token_idr   r   r   )rX   r5   rH   r   r%   bosr   s          r"   r   GPT2Converter.convertedJ  s    ++33E$11;;<F*,#%	
	 #4#:#:<NPUV"0":":L\"]	$..0	4**OUCC))33C22??L'1'D'DguL)' (I$  (2';';'OI$r$   r3   NNrc   rd   re   rf   r   rA   rh   rj   rk   ri   r   r   rl   r3   r$   r"   r   r   I  sI    `d$d38n-$>FtERUWZRZOG\>]$	$ $r$   r   c                   "    \ rS rSrS\4S jrSrg)HerbertConverteriq  r&   c           
      ~   SnSnU R                   R                  n[        U R                   R                  R	                  5       5      nXS   S   ;   a  USS  n[        [        UUS U R                   R                  US95      n[        R                  " SSS9Ul
        [        R                  " 5       Ul        [        R                  " US9Ul        ["        R$                  " U R                   R&                  U R                   R(                  4U R                   R*                  U R                   R,                  4S	9Ul        U$ )
Nz	#version:r   r   r   )r   r   r   F)r   r   r   )r   r   )r-   r   rk   r   r   r   r   r   r	   r   r   r
   r   r   r   r   r   r   BertProcessingr   r   r   r   r   )rX   tokenizer_info_strtoken_suffixr5   rH   r   s         r"   r   HerbertConverter.convertedr  s   (''//d--77<<>?1-ABZF11;;#/
	  +99EY^_	"0"A"A"C	$//|D	#-#<#<((22D4K4K4X4XY((22D4K4K4X4XY$
	 
 r$   r3   Nr   r3   r$   r"   r   r   q      9 r$   r   c            	       V    \ rS rSr SS\\\\4      S\\\	\\4         S\
4S jjrSrg)	Qwen2Converteri  Nr5   rH   r&   c                 8   U(       d  U R                   R                  nU(       d-  [        U R                   R                  R	                  5       5      n[        [        UUS S SSSSS95      n[        R                  " 5       Ul	        [        R                  " [        R                  " [        S5      SSS9[        R                  " [        U R                   SS5      SS9/5      Ul        ["        R                  " 5       Ul        [&        R                  " SS	9Ul        U$ )
Nr   F)r5   rH   r   r   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr%   r%   	use_regexr   )r-   r   rk   r   r   r   r   r	   NFCr   r
   SequenceSplitr   r   r,   r   r   r   r   r   )rX   r5   rH   r   s       r"   r   Qwen2Converter.converted  s    ++33E$11;;@@BCF*,#%#	
	  +0	"0"9"9$$ N (  ((%,T-D-DFXZ_%`##
	  %..0	#-#7#7U#K	 r$   r3   r   r   r3   r$   r"   r   r     sI    `d*d38n-*>FtERUWZRZOG\>]*	* *r$   r   c                   "    \ rS rSrS\4S jrSrg)RobertaConverteri  r&   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " UR                  UR                   4UR"                  UR$                  4UR                  SS9Ul        U$ )Nr   Fr   r   Tr   r   r%   r   )r-   r   rk   r   r   r   r   r
   r   r%   r   r   r   r   RobertaProcessingr   r   r   r   r   rX   otr5   rH   r   s        r"   r   RobertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#?#?r/r/00	$
	  r$   r3   Nr   r3   r$   r"   r  r        9 r$   r  c                   "    \ rS rSrS\4S jrSrg)RoFormerConverteri  r&   c           	      J   SSK Jn  U R                  R                  n[	        [        U[        U R                  R                  5      S95      nSnSn[        U R                  S5      (       a@  U R                  R                  R                  nU R                  R                  R                  n[        R                  " SSUUS9Ul        [        R                   R#                  U" U5      5      Ul        [        U R                  R&                  5      n[        U R                  R(                  5      nU R                  R*                  nU R                  R,                  n	[.        R0                  " U SU S	3U SU S
U S3Xh4Xy4/S9Ul        [4        R
                  " SS9Ul        U$ )Nr   )JiebaPreTokenizerr   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr  r-   r5   r   r   rh   r   r   r   r   r   r	   r   r   r
   PreTokenizercustomr   r   r   r   r   r   r   r   r   r   )
rX   r  r5   r   r   r   r   r   r   r   s
             r"   r   RoFormerConverter.converted  sx   I''--iT=T=T=^=^9_`a	4**,=>> 33CCQQM 33CCQQM*99!&'#	 
	 #1"="="D"DEVW\E]"^	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r$   r3   Nr   r3   r$   r"   r  r    r   r$   r  c                   "    \ rS rSrS\4S jrSrg)DebertaConverteri  r&   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SSSU R                   R                  S5      4SU R                   R                  S5      4/S	9Ul        U$ )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r-   r   rk   r   r   r   r   r
   r   r%   r   r   r   r   r   r   r   r
  s        r"   r   DebertaConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@)4$11GGPQ$11GGPQ$
	  r$   r3   Nr   r3   r$   r"   r  r    r   r$   r  c                   l   ^  \ rS rSrSr\r0 rU 4S jrS r	S r
S rS rS rS	 rS
 rS\4S jrSrU =r$ )SpmConverteri!  Fc                   > [        U S5        [        TU ]  " U6   [        5       nUR	                  5       n[        U R                  R                  S5       nUR                  UR                  5       5        S S S 5        X0l
        U R                  R                  R                  (       a)  U R                  (       d  [        R                  " S5        g g g ! , (       d  f       Nc= f)Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrY   r#   
ModelProtoopenr-   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rX   args	model_pb2mf	__class__s        r"   rY   SpmConverter.__init__&  s    $
+$ $%	  "$))44d;qaffh' <
::""009R9RMMe :S0	 <;s    C
C c                 p    UR                    Vs/ s H  o"R                  UR                  4PM     sn$ s  snf rb   piecesru   scorerX   r(  ru   s      r"   r5   SpmConverter.vocab;  s)    8=Euekk*EEEs   !3c                 .    UR                   R                  $ rb   )r)  unk_idrX   r(  s     r"   r:  SpmConverter.unk_id>  s    !!(((r$   c                    UR                   R                  nU R                  U5      nUS:X  a.  [        [	        UU R                  U5      U R                  S95      nOUS:X  a  U R                  U R                  R                  5      R                  U5      u  pV[        U5       VVV	s0 s H
  u  nu  pX_M     n
nnn	[        [        U
UUR                   R                  SU R                  S S95      nO[        S5      e[        UR                  5       VVs/ s HR  u  pUR                   S;   d  M  XR"                  UR                   S:H  =(       d    UR"                  U R$                  ;   4PMT     nnnUR'                  [)        US	 S
9 VVVs/ s H  u  pn[+        USUS9PM     snnn5        U$ s  sn	nnf s  snnf s  snnnf )Nr   r:  r   r;   T)r   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithm      r@  c                     U S   $ Nr   r3   r4   s    r"   r6   (SpmConverter.tokenizer.<locals>.<lambda>m      QRSTQUr$   r8   F
normalizedspecial)r)  
model_typer5   r   r   r:  r*  SpmExtractorr-   r%  r`   	enumerater   	unk_piece	Exceptionr5  typeru   r   
add_tokensrE   r   )rX   r(  rJ  rG   r   _rH   iwordr6  	bpe_vocabidpspm_added_tokenstokenrI  s                   r"   r   SpmConverter.tokenizerA  s   ''22
zz%(?! ;;u-"&";";I 1_))$*A*A*L*LMUUVbcIA9B<9PQ9P%5Q9PIQ!#00::!"&";"; 	I o  #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGD*V	
 C R*
s   'F.F5/?F5F;c                 .   UR                   R                  n[        R                  " SSS9[        R                  " [        S5      S5      /nU(       d  [        R                  " U5      $ [        R                  " [        R                  " U5      /U-   5      $ )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr	   StripReplacer   r  PrecompiledrX   r(  r_  _normalizerss       r"   r   SpmConverter.normalizers  s{    $44II55g6
 $''55'')@)@AU)V(WZf(fggr$   c                 T    [        X R                  5      n[        R                  " XS9$ Nreplacementr.   )r/   r-   r
   	MetaspacerX   ri  r%   r.   s       r"   r   SpmConverter.pre_tokenizer~  s$    ,-=?V?VW''K__r$   c                     g rb   r3   r   s    r"   r   SpmConverter.post_processor  s    r$   c                 T    [        X R                  5      n[        R                  " XS9$ rg  )r/   r-   r   rj  rk  s       r"   r   SpmConverter.decoder  s$    ,-=?V?VW!!kYYr$   r&   c                    U R                  U R                  5      nU R                  U R                  5      nUb  X!l        SnSn[        U R                  S5      (       a  U R                  R
                  nU R                  X45      nUb  XQl        U R                  X45      Ul        U R                  5       nU(       a  Xal        U$ )Nr]  Tr%   )	r   r(  r   r   r-   r%   r   r   r   )rX   r   r   ri  r%   r   r   s          r"   r   SpmConverter.converted  s    NN4::.	 __TZZ0
!#- 4**,>??#66GG**;I$&3# LLG	,,.'5$r$   r(  )rc   rd   re   rf   r*  rQ   rK  r   rY   r5   r:  r   r   r   r   r   r   r   rl   __classcell__r1  s   @r"   r  r  !  sQ     )LN*F)0d	h`Z9  r$   r  c                   &    \ rS rSrS rS rS rSrg)AlbertConverteri  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf Nd   r5  r{   ru   r6  r7  s      r"   r5   AlbertConverter.vocab  f     
% +=U[[*I*IU[[%++&PUP[P[]b]h]hkn]nOoo%
 	
 
   AA)c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ Nz``"z''r\  r   r	   ra  r-   keep_accentsrD   NFKDStripAccentsr   	Lowercaser^  r_  rb  r   r  rX   r(  list_normalizersr_  s       r"   r   AlbertConverter.normalizer      c*c*
 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR 3 3E'NC HI##$455r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nr  r  r  r  r   r   r   r-   r   r   s    r"   r   AlbertConverter.post_processor  R    ,,)4$11GGPQ$11GGPQ
 	
r$   r3   Nrc   rd   re   rf   r5   r   r   rl   r3   r$   r"   rw  rw        
6&
r$   rw  c                        \ rS rSrS rS rSrg)BarthezConverteri  c                 
    SnU$ Nr@  r3   rX   r(  r:  s      r"   r:  BarthezConverter.unk_id      r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r  r   s    r"   r   BarthezConverter.post_processor  R    ,, +//EEeLM00FFvNO
 	
r$   r3   N)rc   rd   re   rf   r:  r   rl   r3   r$   r"   r  r    s    
r$   r  c                   &    \ rS rSrS rS rS rSrg)CamembertConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )N))z
<s>NOTUSED        <pad>r  )z</s>NOTUSEDr  z<unk>r  )z<unk>NOTUSEDir   z<mask>r  r4  rX   r(  r5   ru   s       r"   r5   CamembertConverter.vocab  sR    
 	,,qr:JK:J;;,:JKK/"" L   !Ac                     gr  r3   r;  s     r"   r:  CamembertConverter.unk_id  s    r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r"   r   !CamembertConverter.post_processor  r  r$   r3   Nrc   rd   re   rf   r5   r:  r   rl   r3   r$   r"   r  r    s    
r$   r  c                   &    \ rS rSrS rS rS rSrg)DebertaV2Converteri  c                    / nU R                   R                  (       a#  UR                  [        R                  " SS95        [        X R                   5      nUR                  [        R                  " XS95        [        R                  " U5      $ )Nr   )r   rh  )r-   split_by_punctrD   r
   Punctuationr/   rj  r  )rX   ri  r%   list_pretokenizersr.   s        r"   r    DebertaV2Converter.pre_tokenizer  sl    ""11%%n&@&@*&UV,-=?V?VW!!.":":{"rs&&'9::r$   c                    / nU R                   R                  (       a$  UR                  [        R                  " 5       5        UR                  [        R
                  " 5       5        UR                  R                  nU(       a%  UR                  [        R                  " U5      5        UR                  [        R                  " [        S5      S5      5        [        R                  " U5      $ )Nr\  r   )r-   r   rD   r	   r  r`  r^  r_  rb  ra  r   r  r  s       r"   r   DebertaV2Converter.normalizer  s    ""00##K$9$9$;< 1 1 34$44II##K$;$;<P$QR 3 3E'NC HI##$455r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r"   r   !DebertaV2Converter.post_processor
  r  r$   r3   N)rc   rd   re   rf   r   r   r   rl   r3   r$   r"   r  r    s    ;6
r$   r  c                   &    \ rS rSrS rS rS rSrg)MBartConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  r  r  r  r  r  r@  )ar_ARr  cs_CZr  de_DEr  en_XXr  es_XXr  et_EEr  fi_FIr  fr_XXr  gu_INr  hi_INr  it_ITr  ja_XXr  kk_KZr  ko_KRr  lt_LTr  lv_LVr  my_MMr  ne_NPr  nl_XXr  ro_ROr  ru_RUr  si_LKr  tr_TRr  vi_VNr  zh_CNr  r  r4  r  s       r"   r5   MBartConverter.vocab  sc    
 	,,qr:JK:J;;,:JKK 
 	
6 	/""; L   !Ac                     gr  r3   r;  s     r"   r:  MBartConverter.unk_id<      r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A </s> en_XXz$A $B </s> en_XXr  r  r   r  r   s    r"   r   MBartConverter.post_processor?  R    ,,"#$11GGPQ00FFvNO
 	
r$   r3   Nr  r3   r$   r"   r  r    s    $L
r$   r  c                   &    \ rS rSrS rS rS rSrg)MBart50ConverteriJ  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nUS/-  nU$ s  snf )Nr  r@  )4r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZAr  )az_AZr  )bn_INr  )fa_IRr  )he_ILr  )hr_HRr  )id_IDr  )ka_GEr  )km_KHr  )mk_MKr  )ml_INr  )mn_MNr  )mr_INr  )pl_PLr  )ps_AFr  )pt_XXr  )sv_SEr  )sw_KEr  )ta_INr  )te_INr  )th_THr  )tl_XXr  )uk_UAr  )ur_PKr  )xh_ZAr  )gl_ESr  )sl_SIr  r  r4  r  s       r"   r5   MBart50Converter.vocabK  sc    
 	,,qr:JK:J;;,:JKK  R  	R/"" Lr  c                     gr  r3   r;  s     r"   r:  MBart50Converter.unk_idW  r  r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzen_XX $A </s>zen_XX $A $B </s>r  r  r   r  r   s    r"   r   MBart50Converter.post_processorZ  r  r$   r3   Nr  r3   r$   r"   r  r  J  s    

r$   r  c                   &    \ rS rSrS rS rS rSrg)NllbConverterie  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr  r@  r4  r  s       r"   r5   NllbConverter.vocabf  E    
 	,,qr:JK:J;;,:JKK L   !>c                     gr  r3   r;  s     r"   r:  NllbConverter.unk_idp  r  r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnr  r   r  r   s    r"   r   NllbConverter.post_processors  sR    ,,%&T44JJ:VW00FFvNO
 	
r$   r3   Nr  r3   r$   r"   r  r  e  s    
r$   r  c                   &    \ rS rSrS rS rS rSrg)SeamlessM4TConverteri~  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )N)r  r  r  r  r@  r4  r  s       r"   r5   SeamlessM4TConverter.vocab  r!  r"  c                 .    U R                   R                  $ rb   )r-   unk_token_idr;  s     r"   r:  SeamlessM4TConverter.unk_id  s    &&333r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__r  r   r  r   s    r"   r   #SeamlessM4TConverter.post_processor  sR    ,,$%D33II)TU00FFvNO
 	
r$   r3   Nr  r3   r$   r"   r)  r)  ~  s    4
r$   r)  c                   &    \ rS rSrS rS rS rSrg)XLMRobertaConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nUS/-  nU$ s  snf )Nr  r@  r  r4  r  s       r"   r5   XLMRobertaConverter.vocab  sR    
 	,,qr:JK:J;;,:JKK/"" Lr  c                 
    SnU$ r  r3   r  s      r"   r:  XLMRobertaConverter.unk_id  r  r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r"   r   "XLMRobertaConverter.post_processor  r  r$   r3   Nr  r3   r$   r"   r3  r3        	
r$   r3  c                   &    \ rS rSrS rS rS rSrg)XLNetConverteri  c                     UR                    Vs/ s HP  n[        UR                  5      (       a  UR                  UR                  4OUR                  UR                  S-
  4PMR     sn$ s  snf ry  r{  r7  s      r"   r5   XLNetConverter.vocab  r}  r~  c                    [         R                  " SS5      [         R                  " SS5      /nU R                  R                  (       dH  UR	                  [         R
                  " 5       5        UR	                  [         R                  " 5       5        U R                  R                  (       a$  UR	                  [         R                  " 5       5        UR                  R                  nU(       a%  UR	                  [         R                  " U5      5        UR	                  [         R                  " [        S5      S5      5        [         R                  " U5      $ r  r  r  s       r"   r   XLNetConverter.normalizer  r  r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r  r   s    r"   r   XLNetConverter.post_processor  r  r$   r3   Nr  r3   r$   r"   r<  r<    r  r$   r<  c                       \ rS rSrSrg)ReformerConverteri  r3   Nrc   rd   re   rf   rl   r3   r$   r"   rD  rD        r$   rD  c                        \ rS rSrS rS rSrg)RemBertConverteri  c                    [         R                  " SS5      [         R                  " SS5      [         R                  " [        S5      S5      /nU R                  R                  (       dH  UR                  [         R                  " 5       5        UR                  [         R                  " 5       5        U R                  R                  (       a$  UR                  [         R                  " 5       5        UR                  R                  nU(       a%  UR                  [         R                  " U5      5        [         R                  " U5      $ r  )r	   ra  r   r-   r  rD   r  r  r   r  r^  r_  rb  r  r  s       r"   r   RemBertConverter.normalizer  s    c*c*g4

 &&33##K$4$4$67##K$<$<$>?""00##K$9$9$;<$44II##K$;$;<P$QR##$455r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r"   r   RemBertConverter.post_processor  r  r$   r3   N)rc   rd   re   rf   r   r   rl   r3   r$   r"   rH  rH    s    6&
r$   rH  c                       \ rS rSrSrg)BertGenerationConverteri  r3   NrE  r3   r$   r"   rN  rN    rF  r$   rN  c                   ,    \ rS rSrS rS rS rS rSrg)PegasusConverteri  c                    U R                   R                  S4U R                   R                  S4/nU R                   R                  b  X R                   R                  S4/-  nU R                   R                  bI  U R                   R
                  U R                   R                  :  a  X R                   R                  S4/-  nU[        SU R                   R                  5       Vs/ s H  nSU S3S4PM     sn-  nX!R                  SS   Vs/ s H  oDR                  UR                  4PM     sn-  nU$ s  snf s  snf )Nr  r;   z<unk_>g      Y)r-   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrC   r5  ru   r6  )rX   r(  r5   rR  ru   s        r"   r5   PegasusConverter.vocab   s)   $$..4$$..4

 ""22>..>>DEEE ##..:''558O8O8V8VV..993?@@E%4;R;R;Y;Y2Z[2ZQU1#Q<(2Z[[,,qr:JK:J;;,:JKK \Ks   &D6!D;c                 \    UR                   R                  U R                  R                  -   $ rb   )r)  r:  r-   rX  r;  s     r"   r:  PegasusConverter.unk_id  s%    !!((4+B+B+I+IIIr$   c                     [        X R                  5      n[        R                  " [        R                  " 5       [        R
                  " XS9/5      $ rg  )r/   r-   r
   r  WhitespaceSplitrj  rk  s       r"   r   PegasusConverter.pre_tokenizer  sE    ,-=?V?VW&&..0(([`
 	
r$   c                     U R                   R                  nXR                   R                  4/n[        R                  " SU/SSU/US9$ )N$A$Br   )r-   rT  eos_token_idr   r   )rX   eosr   s      r"   r   PegasusConverter.post_processor  sP    %%//))667
 ,,T3KtTSVFWhvwwr$   r3   N)	rc   rd   re   rf   r5   r:  r   r   rl   r3   r$   r"   rP  rP    s    &J
xr$   rP  c                        \ rS rSrS rS rSrg)T5Converteri'  c                     U R                   R                  nUR                   Vs/ s H  o3R                  UR                  4PM     nnU[        US-
  SS5       Vs/ s H  nSU S3S4PM     sn-  nU$ s  snf s  snf )Nr   rw   z
<extra_id_rR  r  )r-   
_extra_idsr5  ru   r6  rC   )rX   r(  num_extra_idsru   r5   rR  s         r"   r5   T5Converter.vocab(  s|    //::9>F++u{{+FE-!:KRQS4TU4TqZs!$c*4TUU GUs   !A4A9c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ Nr`  r  )r`  r  ra  r  r   r  r   s    r"   r   T5Converter.post_processor.  =    ,,&>-00FFvNO
 	
r$   r3   N)rc   rd   re   rf   r5   r   rl   r3   r$   r"   rf  rf  '  s    
r$   rf  c                       \ rS rSrS rSrg)UdopConverteri8  c                 n    [         R                  " SS// SQSU R                  R                  S5      4/S9$ rl  r  r   s    r"   r   UdopConverter.post_processor9  rn  r$   r3   Nrc   rd   re   rf   r   rl   r3   r$   r"   rp  rp  8  s    
r$   rp  c                   "    \ rS rSrS\4S jrSrg)WhisperConverteriC  r&   c                    U R                   R                  n[        U R                   R                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " U R                   R                  S9Ul
        [        R                  " 5       Ul        U R                   R                  nU R                   R                  U5      nU R                   R                  nU R                   R                   nSR#                  U Vs/ s H  o S3PM	     sn5      n	[$        R&                  " U	 SU S3U	 SU S	3Xg4/[)        XT5      QS
9Ul        U$ s  snf )Nr   Fr   r   r   r   z $A:0 z $A:0 $B:1 r   r   )r-   r   rk   r   r   r   r   r
   r   r%   r   r   r   prefix_tokensconvert_ids_to_tokensrT  rb  joinr   r   zipr   )
rX   r5   rH   r   prefix_token_idsprefixesrc  rb  rX  prefix_templates
             r"   r   WhisperConverter.convertedD  sO   ''//d--77<<>?*,#%	
	 #1":":DLcLcLtLt"u	$..0	22@@**@@AQR%%//..;;((h#GhUgRLh#GH#-#@#@%&fSE4#$KuB7#X0$
	   $Hs   Er3   Nr   r3   r$   r"   ru  ru  C  s     9  r$   ru  c                       \ rS rSrS rSrg)BigBirdConverterig  c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ r  r  r   s    r"   r   BigBirdConverter.post_processorh  r  r$   r3   Nrs  r3   r$   r"   r  r  g  s    
r$   r  c                   "    \ rS rSrS\4S jrSrg)CLIPConverteris  r&   c                 j   U R                   R                  n[        U R                   R                  R	                  5       5      nU R                   R
                  n[        [        UUS SSS[        U5      S95      n[        R                  " [        R                  " 5       [        R                  " [        S5      S5      [        R                  " 5       /5      Ul        [         R                  " [         R"                  " [        S5      SS	S
9[         R$                  " SS9/5      Ul        [(        R$                  " 5       Ul        [,        R.                  " U R                   R0                  U R                   R2                  4U R                   R4                  U R                   R6                  4SSS9Ul        U$ )Nr   r   Fr5   rH   r   r   r   r   r   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r  )r-   r   rk   r   r   r   r   r   rh   r	   r  r  ra  r   r  r   r
   r  r   r   r   r   r   r	  rT  rb  r   r   r   r   s        r"   r   CLIPConverter.convertedt  sk   ''//d--77<<>?++55	*,#)i.

	  +33__ 3 3E&M3 GI^I^I`a 
	 #1"9"9$$Z[&
 ((%@	#
	 %..0	 $.#?#?((22D4K4K4X4XY((22D4K4K4X4XY"	$
	  r$   r3   Nr   r3   r$   r"   r  r  s  s    '9 'r$   r  c                   "    \ rS rSrS\4S jrSrg)LayoutLMv2Converteri  r&   c           	      b   U R                   R                  n[        [        U[	        U R                   R
                  5      S95      nSnSnSn[        U R                   S5      (       a`  U R                   R                  R                  nU R                   R                  R                  nU R                   R                  R                  n[        R                  " SUUUS9Ul        [        R                  " 5       Ul        [	        U R                   R"                  5      n[	        U R                   R$                  5      nU R                   R&                  nU R                   R(                  n	[*        R,                  " U SU S3U SU SU S	3Xh4Xy4/S
9Ul        [0        R                  " SS9Ul        U$ )Nr   FTr   r   r   r   r   r   r   r   r   r   r   s
             r"   r   LayoutLMv2Converter.converted  s   ''--iT=T=T=^=^9_`a	!&4**,=>>%)%<%<%L%L%c%c" 33CCQQM 33CCQQM*99!7'#	 
	 #1"A"A"C	$))334$))334..;;..;;#-#@#@U(3%r*5XcU"5##$
	  %..d;	r$   r3   Nr   r3   r$   r"   r  r    r   r$   r  c                   "    \ rS rSrS\4S jrSrg)BlenderbotConverteri  r&   c                    U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSS95      n[        R                  " UR                  S9Ul
        [        R                  " 5       Ul        [        R                  " SUR                   S3UR                  UR                   4/S9Ul        U$ )Nr   Fr   r   z$A:0 r   )r   r   )r-   r   rk   r   r   r   r   r
   r   r%   r   r   r   r   r   rT  rb  r   r
  s        r"   r   BlenderbotConverter.converted  s    $$

bll'')**,#%	
	 #1":":BL_L_"`	$..0	#-#@#@2<<.+r/$
	  r$   r3   Nr   r3   r$   r"   r  r    r  r$   r  c                   &    \ rS rSrS rS rS rSrg)XGLMConverteri  c                     / SQnX!R                   SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU/ SQ-  nU$ s  snf )Nr  r@  ))z<madeupword0>r  )z<madeupword1>r  )z<madeupword2>r  )z<madeupword3>r  )z<madeupword4>r  )z<madeupword5>r  )z<madeupword6>r  r4  r  s       r"   r5   XGLMConverter.vocab  sV    
 	,,qr:JK:J;;,:JKK  z  	z Ls   !Ac                 
    SnU$ r  r3   r  s      r"   r:  XGLMConverter.unk_id  r  r$   c           	          [         R                  " SSSU R                  R                  S5      4SU R                  R                  S5      4/S9$ )Nz</s> $Az</s> $A </s> </s> $Br  r  r   r  r   s    r"   r   XGLMConverter.post_processor  sR    ,,'//EEeLM00FFvNO
 	
r$   r3   Nr  r3   r$   r"   r  r    r:  r$   r  c                   D    \ rS rSrSr\rSS1r S rS r	S r
S rS	 rS
rg)GemmaConverteri  Tz<start_of_turn>z<end_of_turn>c                 0    [         R                  " SS5      $ Nr   r]  )r	   ra  r;  s     r"   r   GemmaConverter.normalizer  s    ""3..r$   c                    U R                   R                  S4U R                   R                  S4U R                   R                  S4/nX!R                  SS   Vs/ s H  o3R
                  UR                  4PM     sn-  n[        S U 5       5      (       d#  [        S [        U5       5       S 5      nUb  SX$'   U$ s  snf )Nr  r@  c              3   0   #    U  H  oS    S:H  v   M     g7f)r   rq   Nr3   ).0r4   s     r"   	<genexpr>'GemmaConverter.vocab.<locals>.<genexpr>  s     /AQ44<s   c              3   @   #    U  H  u  pUS    S:X  d  M  Uv   M     g7f)r   rr   Nr3   )r  rR  r4   s      r"   r  r    s!     "V1AQqTXEU111As   	)rq   r  )
r-   rS  rT  r   r5  ru   r6  anynextrL  )rX   r(  r5   ru   override_indexs        r"   r5   GemmaConverter.vocab  s    $$..4$$..4$$..4

 	,,qr:JK:J;;,:JKK ////!"V51A"VX\]N)(3% Ls   !B;c                 0    [         R                  " SS5      $ )Nr   merged_with_previous)r
   r  rX   ri  r%   s      r"   r   GemmaConverter.pre_tokenizer   s    ##C)?@@r$   c                 
    SnU$ r  r3   r  s      r"   r:  GemmaConverter.unk_id#  r  r$   c                     [         R                  " [         R                  " SS5      [         R                  " 5       [         R                  " 5       /5      $ )Nr]  r   )r   r  ra  ByteFallbackFuser  s      r"   r   GemmaConverter.decoder'  s?        ,%%'
 	
r$   r3   N)rc   rd   re   rf   r*  rn   rK  r   r   r5   r   r:  r   rl   r3   r$   r"   r  r    s6    .L'9N/ A
r$   r  c                   <    \ rS rSrSrS rS rS rS rS r	S r
S	rg
)LlamaConverteri1  Tc                 *   U R                   R                  S5      S4U R                   R                  S5      S4U R                   R                  S5      S4/nX!R                  SS   Vs/ s H  o3R                  UR                  4PM     sn-  nU$ s  snf )Nr   r  r   r;   r@  )r-   rx  r5  ru   r6  r  s       r"   r5   LlamaConverter.vocab4  s    $$::1=sC$$::1=sC$$::1=sC

 	,,qr:JK:J;;,:JKK Ls   (!Bc                 
    SnU$ rC  r3   r  s      r"   r:  LlamaConverter.unk_id=  r  r$   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ Nr]  r   r   )contentr[  r   ra  r  r  r`  r  rX   ri  r%   sequences       r"   r   LlamaConverter.decoderA  \    UC(!!#MMO

 !<==H  **r$   c                    [        U R                  SS5      (       ae  / n[        U R                  SS5      (       a  U[        R                  " SS9/-  nU[        R                  " SSS9/-  n[        R
                  " U5      $ g )Nr)   Tr%   r]  )prependr   )patternr  )r,   r-   r	   Prependra  r  )rX   r(  r  s      r"   r   LlamaConverter.normalizerK  sx    4**Hd;;Ht..0BDII[00?@@,,S%HIIH''11r$   c                     [        U R                  SS5      (       d*  [        X R                  5      n[        R                  " XSS9$ g )Nr)   TFri  r.   split)r,   r-   r/   r
   rj  rk  s       r"   r   LlamaConverter.pre_tokenizerT  s?    t..$??01ACZCZ[N!++joppr$   c                     g rb   r3   r   s    r"   r   LlamaConverter.post_processorZ  s    r$   r3   N)rc   rd   re   rf   r*  r5   r:  r   r   r   r   rl   r3   r$   r"   r  r  1  s&    +r$   r  c                   "    \ rS rSrS\4S jrSrg)MarkupLMConverteri_  r&   c                 z   U R                   nUR                  n[        UR                  R	                  5       5      n[        [        UUS SSSU R                   R                  S95      n[        R                  " UR                  S9Ul        [        R                  " 5       Ul        [        U R                   R                  5      n[        U R                   R                   5      nU R                   R"                  nU R                   R$                  n[&        R(                  " U SU 3U SU SU 3XW4Xh4/S9Ul        U$ )Nr   Fr  r   z $A z $B r   )r-   r   rk   r   r   r   r   r   r
   r   r%   r   r   r   rh   r   r   r   r   r   r   r   )	rX   r  r5   rH   r   r   r   r   r   s	            r"   r   MarkupLMConverter.converted`  s(   $$

bll'')**,#%11;;

	 #1":":BL_L_"`	$..0	$))334$))334..;;..;;#-#@#@U$se$5SEcU+##$
	  r$   r3   Nr   r3   r$   r"   r  r  _  s    "9 "r$   r  c                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
MoshiConverteri  TNc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        XPl        g ! , (       d  f       N= fNr   r!  	r   r}   rY   r#   r#  r$  r&  r'  r(  )rX   r%  model_max_lengthkwargsr.  r/  r0  s          r"   rY   MoshiConverter.__init__  se    $
+4, $%	  "*d#qaffh' $
 $#    A77
Bc                     UR                   R                  n[        R                  " SS5      /nU(       d  [        R                  " U5      $ [        R                  " [        R
                  " U5      /U-   5      $ r  )r^  r_  r	   ra  r  rb  rc  s       r"   r   MoshiConverter.normalizer  sg    $44IIU+
 $''55'')@)@AU)V(WZf(fggr$   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU(       a  U[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       r"   r   MoshiConverter.decoder  r  r$   c                 0    Sn[         R                  " XSS9$ )Nr*   Fr  )r
   rj  rk  s       r"   r   MoshiConverter.pre_tokenizer  s     ''Kfkllr$   rs  rb   )
rc   rd   re   rf   r*  rY   r   r   r   rl   r3   r$   r"   r  r    s    h+mr$   r  c                   L    \ rS rSrSrSS jrS rS rS rS r	S	 r
S
 rS rSrg)HeliumConverteri  TNc                    [        U S5        [        R                  X5        [        5       nUR	                  5       n[        US5       nUR                  UR                  5       5        S S S 5        X@l        g ! , (       d  f       N= fr  r  )rX   r%  r-  r.  r/  r0  s         r"   rY   HeliumConverter.__init__  sc    $
+4,#%	  "*d#qaffh' $
 $#r  c                 R   U R                  U5      n[        [        UU R                  U5      U R                  S95      n[        UR                  5       VVs/ s HR  u  pEUR                  S;   d  M  XER                  UR                  S:H  =(       d    UR                  U R                  ;   4PMT     nnnUR                  [        US S9 VVVs/ s H  u  pGn[        USUSS9PM     snnn5        UR                  [        S	SSS
9/5        UR                  SSS9  U$ s  snnf s  snnnf )Nr>  r?  r@  c                     U S   $ rC  r3   rD  s    r"   r6   +HeliumConverter.tokenizer.<locals>.<lambda>  rF  r$   r8   FT)rH  rI  single_word
rG  r  )rS  pad_id)r5   r   r   r:  r*  rL  r5  rO  ru   r   rP  rE   r   enable_padding)	rX   r(  rG   r   rU  rV  rW  rX  rI  s	            r"   r   HeliumConverter.tokenizer  s    zz%({{5)"77
	 #5<<0
0vv IR!&&A+GD4G4G)GH0 	 

 	 +11A~*V*V&Bw 5UGQUV*V	
 	j%OPQ  71 =
s   D1?DD"c                     / nUR                    HB  nUR                  S:X  a  USUR                  4/-  nM'  X#R                  UR                  4/-  nMD     U$ )Nz<0x0A>r  r4  r  s       r"   r5   HeliumConverter.vocab  sV    \\E{{h&4-..;;455	 "
 r$   c                 
    SnU$ rC  r3   r  s      r"   r:  HeliumConverter.unk_id  r  r$   c                     [         R                  " SS5      [         R                  " 5       [         R                  " 5       /nU[         R                  " SSS9/-  n[         R
                  " U5      $ r  r  r  s       r"   r   HeliumConverter.decoder  sY    UC(!!#MMO

 	X^^Ca899  **r$   c                     [         R                  " [         R                  " S5      [         R                  " SS5      /5      $ r  )r	   r  r  ra  r;  s     r"   r   HeliumConverter.normalizer  s2    ##[%8%8%={?R?RSWY^?_$`aar$   c                 Z    [         R                  " [         R                  " SS5      /5      $ )Nr  
contiguous)r
   r  r  r  s      r"   r   HeliumConverter.pre_tokenizer  s#    &&(<(<T<(P'QRRr$   c                 8    [         R                  " SS// SQS/S9$ )Nr  r`  )r  r`  r  ra  )r  r   r   )r   r   r   s    r"   r   HeliumConverter.post_processor  s/    ,, 
 	
r$   rs  rb   )rc   rd   re   rf   r*  rY   r   r5   r:  r   r   r   r   rl   r3   r$   r"   r  r    s2    
8+bS
r$   r  c            	         [        [        [        S5      [        S5      S-   5      5      [        [        [        S5      [        S5      S-   5      5      -   [        [        [        S5      [        S5      S-   5      5      -   n U SS nS	n[        S
5       H4  nX0;  d  M
  U R                  U5        UR                  S
U-   5        US-  nM6     U Vs/ s H  n[	        U5      PM     nn[        [        X5      5      $ s  snf )a  
Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
characters the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
!~r      ¡   ¬   ®   ÿNr      )rk   rC   ordrD   chrrA   rz  )bscsnbs       r"   bytes_to_unicoder    s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   C:c                   Z   ^  \ rS rSrSr    S
U 4S jjrS\4S jrS rS\	4S jr
S	rU =r$ )TikTokenConverteri   z
A general tiktoken converter.
c                    > [         TU ]  " U6   Xl        X l        X0l        [        U[        5      (       a  UR                  5       U l        g UU l        g rb   )	r"  rY   r%  r  r%   
isinstancerA   r   additional_special_tokens)rX   r%  r  r%   r
  r-  r  r1  s          r"   rY   TikTokenConverter.__init__%  sU     	$$ 0 3T:: &**, 	& + 	&r$   tiktoken_urlc                 >  ^^  SSK Jn  U" U5      m[	        5       mU4S jn/ n0 nTR                  5        H  u  pgXuU" U5      '   [        U5      S:X  a  M   / n[        S[        U5      5       H8  n	US U	 XiS  pU
T;   d  M  UT;   d  M  X-   T;   d  M%  UR                  XU45        M:     [        UU4S jSS9nUR                  U5        M     [        US	 SS9nU Vs/ s H  o" US   5      U" US   5      4PM     nnXT4$ ! [         a    [        S5      ef = fs  snf )
Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf )Nr   zlatin-1)ry  decoder  )r  charbyte_encoders     r"   token_bytes_to_stringPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringC  s8    77@ST@SLT3@STUUTs   ?r   c                 $   > TU S      TU S      4$ r2   r3   )r4   r   s    r"   r6   CTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Q  s    1Q4)AaD/0Rr$   Fr?   c                     U S   $ )Nr;   r3   r=   s    r"   r6   r  S  s    Ar$   )tiktoken.loadr  rN  
ValueErrorr  rB   r<   rC   rD   rE   rF   )rX   r  r  r  rH   r5   rX  rankrK   rL   rM   rN   r>   r   r  s                @@r"   extract_vocab_merges_from_model1TikTokenConverter.extract_vocab_merges_from_model8  sC   	7 &l3	')	V $??,KE26'./5zQEq#e*-#(%=%-i'Gy,@gFW\eEeLL'D!9: . 5&R\abEMM%  - $6F\bc\bUX(Q02GA2OP\bc}5  	k 	2 ds   D DDc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ )NF)r   ignore_mergesT)r  r%  r   r   r   rS   r  )rX   rG   rH   r   s       r"   r   TikTokenConverter.tokenizerW  sM    #CCDOOTc,GH	9??O44,0IOO)r$   r&   c                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                   Vs/ s H  n[        USSS9PM     sn5        [        R                  " SS9Ul        U$ s  snf )Nr   Fr   r   TrG  r   )r   r
   r  r  r   r  r   r%   r   r   r   r   r
  r   r   r   )rX   r   rX  s      r"   r   TikTokenConverter.converted^  s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$LPLjLjkLj5Z%>Ljk	
 $.#7#7U#K	  ls   %C)r%   r
  r  r%  )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)rc   rd   re   rf   rg   rY   rh   r  r   r   r   rl   rt  ru  s   @r"   r  r     s@      K"&
&C >9  r$   r  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3Tokenizerc                    U R                   R                  nU[        ;   a&  U(       d  [        U   nU" U 5      R                  5       $  [        R                  S5        [        U R                  U R                  S9R                  5       $ ! [         a*    [        S[        [        R                  5       5       35      ef = f)a\  
Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

Args:
    transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
        Instance of a slow tokenizer to convert in the backend tokenizer for
        [`~tokenization_utils_base.PreTrainedTokenizerFast`].
   from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
        Defaults to False.

Return:
    A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
    [`~tokenization_utils_base.PreTrainedTokenizerFast`]
zConverting from Tiktoken)r%  r
  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r1  rc   SLOW_TO_FAST_CONVERTERSr   loggerinfor  r%  r
  rN  r  rk   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classs       r"   convert_slow_tokenizerrc    s      1::CC66}12FG45??AA	KK23$0;;*?*Y*Y ik  	>>BCZC_C_Ca>b=ce 	s   A B	 	4B=)r   )F)Prg   r+  typingr   	packagingr   
tokenizersr   r   r   r   r	   r
   r   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrc   r]  r#   boolrh   r/   rO   rQ   rn   r{   r}   r   r   r   r   r   r   r   r   r  r  r  r  rw  r  r  r  r  r  r  r)  r3  r<  rD  rH  rN  rP  rf  rp  ru  r  r  r  r  r  r  r  r  r  r  r  r  r\  rc  r3   r$   r"   <module>rl     sh      f f f 5 5 ` ` 5 
		H	%G"$ s & 2"8 "Ic Id I$ $$I $N/	 /d$i $N$Y $N 6%I %Py >+Y +\y :$	 $Ny >~9 ~B"
l "
J
| 
 
 
:
 
B2
\ 2
j
| 
6
L 
2
< 
2
, 
6"
\ "
J	 	
| 
@	l 	%x| %xP
, 
"
L 
!y !H	
| 	
(I (V$) $N) :
L 
61
\ 1
h+\ +\#	 #L&m\ &mRV
l V
t0N Nb::%: (: ]	:
 (: .: ,: ]: : : (: ,: =: -: "=:  !-!:" #:$ _%:& ':( ]):* (+:, -:. =/:0 +1:2 -3:4 +5:6 $7:8 }9:: *;:< n=:> (?:@ nA:B =C:D $E:F ]G:H ,I:J (K:L nM:N mO:P *Q:R (S:T -U:V (W:X *Y:Z 0[:\ M]:^ ;_:` ]a:b (c:d .e:f ng:h +"$($#s: z!) !r$   