o
    Zh                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ eeZdddZdedefddZdd Z G dd dZ!G dd de!Z"dedefddZ#G dd dZ$G dd de$Z%G dd de$Z&G d d! d!e$Z'G d"d# d#e$Z(G d$d% d%e$Z)G d&d' d'e$Z*G d(d) d)e$Z+G d*d+ d+e$Z,G d,d- d-e$Z-G d.d/ d/e$Z.G d0d1 d1e$Z/G d2d3 d3e$Z0G d4d5 d5e0Z1G d6d7 d7e0Z2G d8d9 d9e0Z3G d:d; d;e0Z4G d<d= d=e0Z5G d>d? d?e0Z6G d@dA dAe0Z7G dBdC dCe0Z8G dDdE dEe0Z9G dFdG dGe0Z:G dHdI dIe0Z;G dJdK dKe0Z<G dLdM dMe0Z=G dNdO dOe0Z>G dPdQ dQe0Z?G dRdS dSe0Z@G dTdU dUe$ZAG dVdW dWe0ZBG dXdY dYe$ZCG dZd[ d[e$ZDG d\d] d]e$ZEG d^d_ d_e0ZFG d`da dae0ZGG dbdc dce0ZHG ddde dee$ZIG dfdg dge0ZJG dhdi die0ZKdjdk ZLG dldm dmZMi dne1doe-dpe2dqe%dreBdseEdte3dueCdve*dwe%dxe/dye4dze%d{e%d|e%d}e%d~e%i de1de'de*de+de%de%de-de9de-de-de%deIde5de6de(de%de-i de7de)de>de,de%de;de<de%de-de.de8de%de?de@deAde9de:e&eFeHeHeGeHdZNdde	fddZOdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   Zgoogle.protobufr   parseprotobuf__version__Ztransformers.utilsr   ImportErrorr   format)error_messager   Zgoogle r   R/var/www/auris/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf#   s   r    add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)r!   original_tokenizerprepend_schemer   r   r   _get_prepend_scheme4   s   r*   c           
         s   |d u}|r
t |n }g }| D ]<\}}g }tdt|D ]}|d | ||d  }}	| v r>|	 v r>|||	|f qt| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   r   xvocabr   r   <lambda>I       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr   r   r   r1   L   s    r4   reversec                 S   s   g | ]
}|d  |d fqS r   r   r   .0r8   r   r   r   
<listcomp>M       z#generate_merges.<locals>.<listcomp>)dictitemsranger6   appendsortedextend)
r0   vocab_scoresr:   mergesmergeZpiece_scorelocalindexpiece_lpiece_rr   r/   r   generate_merges>   s   rM   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rP   spLoad)selfrO   rP   r   r   r   __init__V   s   
zSentencePieceExtractor.__init__Nr"   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                       i | ]}  ||qS r   Zid_to_piecer=   rJ   rQ   r   r   
<dictcomp>c   r2   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rQ   rB   GetPieceSizerM   rS   rF   r0   rG   r   rY   r   extract]   s   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strrT   tupler@   intlistr]   r   r   r   r   rN   Q   s    (rN   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr"   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )rU   c                    rV   r   rW   rX   rY   r   r   rZ   q   r2   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rQ   rB   r[   getrM   r\   r   rY   r   r]   k   s   
z#GemmaSentencePieceExtractor.extractr^   )	r_   r`   ra   rd   r@   rc   re   rf   r]   r   r   r   r   rg   j   s    (rg   piecec                 C   s&   t | dk p| d dkp| d   S )Nr5   ,)r6   isdigit)rk   r   r   r   check_number_comma{   s   &rp   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S r^   )r(   )rS   r(   r   r   r   rT      s   
zConverter.__init__r"   c                 C   s   t  r^   )NotImplementedErrorrS   r   r   r   	converted   s   zConverter.convertedN)r_   r`   ra   rT   r   rt   r   r   r   r   rq      s    rq   c                   @      e Zd ZdefddZdS )BertConverterr"   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerTZ
clean_textZhandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr(   r0   r   r   rc   ry   hasattrrz   tokenize_chinese_charsr|   do_lower_caser   BertNormalizer
normalizerr	   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr
   TemplateProcessingpost_processorr   decoder
rS   r0   	tokenizerr   r|   r   clssepr   r   r   r   r   rt      :   



zBertConverter.convertedNr_   r`   ra   r   rt   r   r   r   r   rv          rv   c                   @   ru   )SplinterConverterr"   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nrx   Frz   Tr{   .rightr~    r   r   r   r   r   r   )r(   r0   r   r   rc   ry   r   rz   r   r|   r   r   r   r   r	   r   r   r   r   Zquestion_tokenr   r   question_token_idconvert_tokens_to_idsZpadding_sider
   r   r   r   r   )rS   r0   r   r   r|   r   r   r   questiondotr   r   r   Zdot_token_idr   r   r   r   rt      sL   



$"
zSplinterConverter.convertedNr   r   r   r   r   r      r   r   c                   @   ru   )FunnelConverterr"   c           
      C   rw   )Nrx   Frz   Tr{   z:2 $A:0 r   r   r   r   r   r   r   r   r   r   r   rt      r   zFunnelConverter.convertedNr   r   r   r   r   r      r   r   c                   @   ru   )MPNetConverterr"   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nrx   Frz   Tr{   r~   r   z:0 r   r   r   r   r   r   r   r   r   r   rt     s:   



zMPNetConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )OpenAIGPTConverterr"   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r0   rG   dropoutry   end_of_word_suffixfuse_unkT)r}   suffix)r(   encoderrf   	bpe_rankskeysry   r   r   rc   Ztoken_to_idadd_special_tokensr   r   r   r	   r   r   r   
BPEDecoderr   rS   r0   rG   ry   r   r   r   r   rt   /  s&   
zOpenAIGPTConverter.convertedNr   r   r   r   r   r   .  r   r   c                	   @   B   e Zd Z	ddeeeef  deeeeef   de	fddZ
dS )GPT2ConverterNr0   rG   r"   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )Nr   Fr0   rG   r   continuing_subword_prefixr   r   r!   r!   Zadd_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r(   r   rf   r   r   r   r'   r	   	ByteLevelr   r   r   	bos_tokenbos_token_idr
   r   r   )rS   r0   rG   r   r!   Zbosr   r   r   r   rt   J  s:   
zGPT2Converter.convertedNNr_   r`   ra   r   r@   rc   re   rf   rd   r   rt   r   r   r   r   r   I      r   c                   @   ru   )HerbertConverterr"   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   ry   r   F)r}   r|   r   )r   r   )r(   r   rf   r   r   r   r   ry   r   r   r   r	   r   r   r   r   r   r
   ZBertProcessingr   r   r   r   r   )rS   Ztokenizer_info_strZtoken_suffixr0   rG   r   r   r   r   rt   r  s.   

zHerbertConverter.convertedNr   r   r   r   r   r   q  r   r   c                	   @   r   )Qwen2ConverterNr0   rG   r"   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r0   rG   r   ry   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr!   r!   	use_regexr   )r(   r   rf   r   r   r   r   r   NFCr   r	   SequenceSplitr   r   r'   r   r   r   r
   r   )rS   r0   rG   r   r   r   r   rt     sD   

zQwen2Converter.convertedr   r   r   r   r   r   r     r   r   c                   @   ru   )RobertaConverterr"   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r!   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   RobertaProcessingr   r   r   r   r   rS   otr0   rG   r   r   r   r   rt     s,   


zRobertaConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )RoFormerConverterr"   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerrx   Frz   Tr{   r~   r   r   r   r   r   r   )Z"models.roformer.tokenization_utilsr   r(   r0   r   r   rc   ry   r   rz   r|   r   r   r   r   r	   ZPreTokenizerZcustomr   r   r   r   r   r
   r   r   r   r   )
rS   r   r0   r   r|   r   r   r   r   r   r   r   r   rt     s8   

zRoFormerConverter.convertedNr   r   r   r   r   r     r   r   c                   @   ru   )DebertaConverterr"   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   r   r   r   r   r   r   r   rt     s.   
	zDebertaConverter.convertedNr   r   r   r   r   r     r   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrT   r    
ModelProtoopenr(   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rS   args	model_pb2mf	__class__r   r   rT   &  s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r   rk   scorer=   rk   r   r   r   r>   <  r2   z&SpmConverter.vocab.<locals>.<listcomp>piecesrS   r   r   r   r   r0   ;     zSpmConverter.vocabc                 C   s   |j jS r^   )r   unk_idr   r   r   r   r   >     zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r   r   r5   c                 S   s   i | ]	\}\}}||qS r   r   )r=   iwordr   r   r   r   rZ   P      z*SpmConverter.tokenizer.<locals>.<dictcomp>T)ry   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS )      r   typerk   r   r=   idprs   r   r   r>   e  
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S   s    g | ]\}}}t |d |dqS )F
normalizedspecialr   r=   r  tokenr	  r   r   r   r>   k  s    c                 S      | d S Nr   r   r-   r   r   r   r1   m      z(SpmConverter.tokenizer.<locals>.<lambda>r3   )r   
model_typer0   r   r   r   r   SpmExtractorr(   r   r]   	enumerater   Z	unk_piece	Exceptionr   
add_tokensrD   )	rS   r   r  rF   r   _rG   Z	bpe_vocabspm_added_tokensr   rs   r   r   A  sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrS   r   r  Z_normalizersr   r   r   r   s  s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementr)   )r*   r(   r	   	MetaspacerS   r#  r!   r)   r   r   r   r   ~     zSpmConverter.pre_tokenizerc                 C      d S r^   r   rs   r   r   r   r        zSpmConverter.post_processorc                 C   r   r!  )r*   r(   r   r$  r%  r   r   r   r     r&  zSpmConverter.decoderr"   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nr  Tr!   )	r   r   r   r   r(   r!   r   r   r   )rS   r   r   r#  r!   r   r   r   r   r   rt     s    zSpmConverter.converted)r_   r`   ra   r   rN   r  r   rT   r0   r   r   r   r   r   r   r   rt   __classcell__r   r   r   r   r   !  s    2r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r   )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   rp   rk   r   r   r   r   r   r>         $z)AlbertConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r0        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r  r   r   r  r(   keep_accentsrC   NFKDStripAccentsr   	Lowercaser  r  r  r   r   rS   r   Zlist_normalizersr  r   r   r   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   r   r
   r   r(   r   rs   r   r   r   r        zAlbertConverter.post_processorNr_   r`   ra   r0   r   r   r   r   r   r   r+        r+  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr   r   rS   r   r   r   r   r   r        zBarthezConverter.unk_idc                 C   r<  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   r>  rs   r   r   r   r     r?  zBarthezConverter.post_processorN)r_   r`   ra   r   r   r   r   r   r   rC    s    rC  c                   @   r*  )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>rM  )z</s>NOTUSEDrM  z<unk>rM  )z<unk>NOTUSEDic                 S   r   r   r   r   r   r   r   r>     r2   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>rM  r   rS   r   r0   r   r   r   r0     s   
zCamembertConverter.vocabc                 C      dS rE  r   r   r   r   r   r        zCamembertConverter.unk_idc                 C   r<  rH  r>  rs   r   r   r   r     r?  z!CamembertConverter.post_processorNr_   r`   ra   r0   r   r   r   r   r   r   rK    s    rK  c                   @   r*  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r"  )r(   Zsplit_by_punctrC   r	   Punctuationr*   r$  r   )rS   r#  r!   Zlist_pretokenizersr)   r   r   r   r     s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nr  r   )r(   r   rC   r   r9  r  r  r  r  r  r   r   r:  r   r   r   r     s   
zDebertaV2Converter.normalizerc                 C   r<  r=  r>  rs   r   r   r   r   
  r?  z!DebertaV2Converter.post_processorN)r_   r`   ra   r   r   r   r   r   r   r   rV    s    rV  c                   @   r*  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )NrI  rM  rN  rJ  rM  rP  c                 S   r   r   r   r   r   r   r   r>     r2   z(MBartConverter.vocab.<locals>.<listcomp>r   )Zar_ARrM  cs_CZrM  de_DErM  en_XXrM  Zes_XXrM  et_EErM  fi_FIrM  Zfr_XXrM  gu_INrM  hi_INrM  it_ITrM  Zja_XXrM  kk_KZrM  ko_KRrM  lt_LTrM  lv_LVrM  Zmy_MMrM  ne_NPrM  Znl_XXrM  ro_ROrM  ru_RUrM  si_LKrM  tr_TRrM  vi_VNrM  zh_CNrM  rQ  r   rR  r   r   r   r0     s
   
zMBartConverter.vocabc                 C   rS  rE  r   r   r   r   r   r   <  r(  zMBartConverter.unk_idc                 C   r<  )Nz$A </s> en_XXz$A $B </s> en_XXrc  rJ  r   r>  rs   r   r   r   r   ?  r?  zMBartConverter.post_processorNrU  r   r   r   r   rX    s    &rX  c                   @   r*  )MBart50Converterc                 C   rY  )NrZ  c                 S   r   r   r   r   r   r   r   r>   R  r2   z*MBart50Converter.vocab.<locals>.<listcomp>r   )4r]  r^  r`  rb  rd  re  rg  ri  rj  rl  rn  rp  rq  rs  ru  rw  ry  rz  r|  r}  r  r  r  r  r  )af_ZArM  )az_AZrM  )bn_INrM  )fa_IRrM  )he_ILrM  )hr_HRrM  )id_IDrM  )ka_GErM  )Zkm_KHrM  )mk_MKrM  )ml_INrM  )mn_MNrM  )mr_INrM  )pl_PLrM  )ps_AFrM  )Zpt_XXrM  )sv_SErM  )sw_KErM  )ta_INrM  )te_INrM  )th_THrM  )Ztl_XXrM  )uk_UArM  )ur_PKrM  )xh_ZArM  )gl_ESrM  )sl_SIrM  rQ  r   rR  r   r   r   r0   K  s
   
zMBart50Converter.vocabc                 C   rS  rE  r   r   r   r   r   r   W  r(  zMBart50Converter.unk_idc                 C   r<  )Nzen_XX $A </s>zen_XX $A $B </s>rc  rJ  r   r>  rs   r   r   r   r   Z  r?  zMBart50Converter.post_processorNrU  r   r   r   r   r  J  s    r  c                   @   r*  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )NrZ  c                 S   r   r   r   r   r   r   r   r>   m  r2   z'NllbConverter.vocab.<locals>.<listcomp>r   r   rR  r   r   r   r0   f     zNllbConverter.vocabc                 C   rS  rE  r   r   r   r   r   r   p  r(  zNllbConverter.unk_idc                 C   r<  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>Zeng_LatnrJ  r   r>  rs   r   r   r   r   s  r?  zNllbConverter.post_processorNrU  r   r   r   r   r  e      
r  c                   @   r*  )SeamlessM4TConverterc                 C   r  )N)rN  rP  r[  r\  c                 S   r   r   r   r   r   r   r   r>     r2   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r   r   rR  r   r   r   r0     r  zSeamlessM4TConverter.vocabc                 C   s   | j jS r^   )r(   Zunk_token_idr   r   r   r   r     r   zSeamlessM4TConverter.unk_idc                 C   r<  )Nz__eng__ $A </s>z__eng__ $A $B </s>Z__eng__rJ  r   r>  rs   r   r   r   r     r?  z#SeamlessM4TConverter.post_processorNrU  r   r   r   r   r  ~  r  r  c                   @   r*  )XLMRobertaConverterc                 C   rL  )NrZ  c                 S   r   r   r   r   r   r   r   r>     r2   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r   rQ  r   rR  r   r   r   r0     s   
zXLMRobertaConverter.vocabc                 C   rD  rE  r   rF  r   r   r   r     rG  zXLMRobertaConverter.unk_idc                 C   r<  rH  r>  rs   r   r   r   r     r?  z"XLMRobertaConverter.post_processorNrU  r   r   r   r   r        r  c                   @   r*  )XLNetConverterc                 C   r   )Nc                 S   r,  r-  r/  r   r   r   r   r>     r0  z(XLNetConverter.vocab.<locals>.<listcomp>r   r   r   r   r   r0     r1  zXLNetConverter.vocabc                 C   r2  r3  r5  r:  r   r   r   r     r;  zXLNetConverter.normalizerc                 C   r<  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   r>  rs   r   r   r   r     r?  zXLNetConverter.post_processorNr@  r   r   r   r   r    rA  r  c                   @      e Zd ZdS )ReformerConverterNr_   r`   ra   r   r   r   r   r        r  c                   @   rB  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S r3  )r   r  r   r(   r6  rC   r7  r8  r   r9  r  r  r  r   r:  r   r   r   r     s   


zRemBertConverter.normalizerc                 C   r<  r=  r>  rs   r   r   r   r     r?  zRemBertConverter.post_processorN)r_   r`   ra   r   r   r   r   r   r   r    s    r  c                   @   r  )BertGenerationConverterNr  r   r   r   r   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )NrM  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr   r=   r   r   r   r   r>     r?   z*PegasusConverter.vocab.<locals>.<listcomp>r5   c                 S   r   r   r   r   r   r   r   r>     r2   )	r(   	pad_token	eos_tokenZmask_token_sentZ
mask_tokenZmask_token_idoffsetrB   r   rR  r   r   r   r0      s   

zPegasusConverter.vocabc                 C   s   |j j| jj S r^   )r   r   r(   r  r   r   r   r   r     r   zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r!  )r*   r(   r	   r   ZWhitespaceSplitr$  r%  r   r   r   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )r(   r  eos_token_idr
   r   )rS   eosr   r   r   r   r     s   
zPegasusConverter.post_processorN)r_   r`   ra   r0   r   r   r   r   r   r   r   r    s
    	r  c                   @   rB  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r   r   r   r   r   r   r   r>   *  r2   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )z
<extra_id_r  rM  r   r  r   r   r   r>   +  r?   r   rl   )r(   Z
_extra_idsr   rB   )rS   r   Znum_extra_idsr0   r   r   r   r0   (  s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  rJ  )r  rJ  r  rJ  r   r>  rs   r   r   r   r   .     zT5Converter.post_processorN)r_   r`   ra   r0   r   r   r   r   r   r  '  s    r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r  r  r>  rs   r   r   r   r   9  r  zUdopConverter.post_processorNr_   r`   ra   r   r   r   r   r   r  8      r  c                   @   ru   )WhisperConverterr"   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r   r   r=   r  r   r   r   r>   Z  s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   Zprefix_tokensconvert_ids_to_tokensr  r  joinr
   r   zipr   )	rS   r0   rG   r   Zprefix_token_idsprefixesr  r  Zprefix_templater   r   r   rt   D  s8   
	zWhisperConverter.convertedNr   r   r   r   r   r  C  r   r  c                   @   r  )BigBirdConverterc                 C   r<  r=  r>  rs   r   r   r   r   h  r?  zBigBirdConverter.post_processorNr  r   r   r   r   r  g  r  r  c                   @   ru   )CLIPConverterr"   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr0   rG   r   r   r   r   ry   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r(   r   rf   r   r   ry   r   r   rc   r   r   r   r  r   r9  r   r	   r   r   r   r   r   r
   r   r  r  r   r   r   r   r   r   r   rt   t  sD   


zCLIPConverter.convertedNr   r   r   r   r   r  s  r   r  c                   @   ru   )LayoutLMv2Converterr"   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nrx   FTrz   r{   r~   r   r   r   r   r   r   r   r   r   r   r   rt     r   zLayoutLMv2Converter.convertedNr   r   r   r   r   r    r   r  c                   @   ru   )BlenderbotConverterr"   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r   )r   r   )r(   r   rf   r   r   r   r   r	   r   r!   r   r   r   r
   r   r  r  r   r   r   r   r   rt     s*   

zBlenderbotConverter.convertedNr   r   r   r   r   r    r   r  c                   @   r*  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )NrZ  c                 S   r   r   r   r   r   r   r   r>     r2   z'XGLMConverter.vocab.<locals>.<listcomp>r   ))z<madeupword0>rM  )z<madeupword1>rM  )z<madeupword2>rM  )z<madeupword3>rM  )z<madeupword4>rM  )z<madeupword5>rM  )z<madeupword6>rM  r   rR  r   r   r   r0     s   zXGLMConverter.vocabc                 C   rD  rE  r   rF  r   r   r   r     rG  zXGLMConverter.unk_idc                 C   r<  )Nz</s> $Az</s> $A </s> </s> $BrI  rJ  r   r>  rs   r   r   r   r     r?  zXGLMConverter.post_processorNrU  r   r   r   r   r    r  r  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr   r  )r   r  r   r   r   r   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	NrM  c                 S   r   r   r   r   r   r   r   r>     r2   z(GemmaConverter.vocab.<locals>.<listcomp>r   c                 s   s    | ]	}|d  dkV  qdS )r   rh   Nr   )r=   r.   r   r   r   	<genexpr>  s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   ri   Nr   )r=   r   r.   r   r   r   r    s   " )rh   rM  )r(   r  r  r   r   anynextr  )rS   r   r0   Zoverride_indexr   r   r   r0     s   


zGemmaConverter.vocabc                 C   r  )Nr   Zmerged_with_previous)r	   r   rS   r#  r!   r   r   r   r      r  zGemmaConverter.pre_tokenizerc                 C   rD  rE  r   rF  r   r   r   r   #  rG  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr  r   )r   r   r  ByteFallbackFuser  r   r   r   r   '  s   
zGemmaConverter.decoderN)r_   r`   ra   r   rg   r  r   r   r0   r   r   r   r   r   r   r   r    s    
r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   rM  r   r5   c                 S   r   r   r   r   r   r   r   r>   :  r2   z(LlamaConverter.vocab.<locals>.<listcomp>r   )r(   r  r   rR  r   r   r   r0   4  s   zLlamaConverter.vocabc                 C   rD  r  r   rF  r   r   r   r   =  rG  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nr  r   r   )contentr  r   r  r  r  r  r   rS   r#  r!   sequencer   r   r   r   A     

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr$   Tr!   r  )prependr   )patternr  )r'   r(   r   Prependr  r   )rS   r   r  r   r   r   r   K  s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr$   TFr#  r)   split)r'   r(   r*   r	   r$  r%  r   r   r   r   T  s   zLlamaConverter.pre_tokenizerc                 C   r'  r^   r   rs   r   r   r   r   Z  rT  zLlamaConverter.post_processorN)
r_   r`   ra   r   r0   r   r   r   r   r   r   r   r   r   r  1  s    	
	r  c                   @   ru   )MarkupLMConverterr"   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r   )r(   r   rf   r   r   r   r   ry   r	   r   r!   r   r   r   rc   r   r   r   r   r
   r   r   )	rS   r   r0   rG   r   r   r   r   r   r   r   r   rt   `  s8   
	zMarkupLMConverter.convertedNr   r   r   r   r   r  _  r   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S Nr   r   	r   rq   rT   r    r   r   r   r   r   )rS   r   Zmodel_max_lengthkwargsr   r   r   r   r   r   rT     s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r  )r  r  r   r  r   r  r  r   r   r   r     s   

zMoshiConverter.normalizerc                 C   r  r  r  r  r   r   r   r     r  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr%   Fr  )r	   r$  r%  r   r   r   r     s   zMoshiConverter.pre_tokenizerr^   )r_   r`   ra   r   rT   r   r   r   r   r   r   r   r    s    


r  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 G   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S r  r  )rS   r   r   r   r   r   r   r   r   rT     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr   c                    r   r   r  r  rs   r   r   r>     r  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r  r	  Zsingle_wordr
  r  r   r   r   r>     s    c                 S   r  r  r   r-   r   r   r   r1     r  z+HeliumConverter.tokenizer.<locals>.<lambda>r3   
Fr  rO  r   )r  Zpad_id)r0   r   r   r   r   r  r   r  rD   r   Zenable_padding)rS   r   rF   r   r  r   rs   r   r     s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>r  )r   rk   r   )rS   r   r0   rk   r   r   r   r0     s   

zHeliumConverter.vocabc                 C   rD  r  r   rF  r   r   r   r     rG  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r   r   r   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r  )r   r   r  r  r   r   r   r   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr  
contiguous)r	   r   r   r  r   r   r   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )NrI  r  )rI  r  rI  r  )rI  r   r   )r
   r   rs   r   r   r   r     s   zHeliumConverter.post_processorr^   )r_   r`   ra   r   rT   r   r0   r   r   r   r   r   r   r   r   r   r    s    
		r  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r   )chr)r=   nr   r   r   r>     s    z$bytes_to_unicode.<locals>.<listcomp>)rf   rB   ordrC   r@   r  )bscsr  br   r   r   bytes_to_unicode  s   L
r  c                       sN   e Zd ZdZ				d fdd	Zdefdd	Zd
d ZdefddZ	  Z
S )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    s@   t  j|  || _|| _|| _t|tr| | _d S || _d S r^   )	r   rT   r   r  r!   
isinstancer@   r   additional_special_tokens)rS   r   r  r!   r   r   r  r   r   r   rT   %  s   	zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                    s   d  fdd| dD S )Nr   c                    s   g | ]} t | qS r   )r  )r=   charbyte_encoderr   r   r>   D  r2   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)r  decode)r  r  r   r   token_bytes_to_stringC  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    r+   r,   r   r-   )r   r   r   r1   Q  r2   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr9   c                 S   r  )Nr5   r   r7   r   r   r   r1   S  r  c                    s$   g | ]} |d   |d fqS r;   r   r<   )r  r   r   r>   T  s   $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)Ztiktoken.loadr  r  
ValueErrorr  rA   r6   rB   rC   rD   rE   )rS   r  r  rG   r0   r  ZrankrI   rJ   rK   rL   r   )r   r  r  r   extract_vocab_merges_from_model8  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)r   ignore_mergesT)r	  r   r   r   r   rO   r
  )rS   rF   rG   r   r   r   r   r   W  s
   zTikTokenConverter.tokenizerr"   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]	}t |d ddqS )FTr  r
  r  r   r   r   r>   i  r   z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r	   r   r   r   r  r   r!   r   r   r   r   r   r
   r   )rS   r   r   r   r   rt   ^  s   
zTikTokenConverter.converted)Nr  FN)r_   r`   ra   rb   rT   rc   r	  r   r   rt   r)  r   r   r   r   r     s    r  ZAlbertTokenizerZBartTokenizerZBarthezTokenizerZBertTokenizerZBigBirdTokenizerZBlenderbotTokenizerZCamembertTokenizerZCLIPTokenizerZCodeGenTokenizerZConvBertTokenizerZDebertaTokenizerZDebertaV2TokenizerZDistilBertTokenizerZDPRReaderTokenizerZDPRQuestionEncoderTokenizerZDPRContextEncoderTokenizerZElectraTokenizerZFNetTokenizerZFunnelTokenizerZGPT2TokenizerZHerbertTokenizerZLayoutLMTokenizerZLayoutLMv2TokenizerZLayoutLMv3TokenizerZLayoutXLMTokenizerZLongformerTokenizerZLEDTokenizerZLxmertTokenizerZMarkupLMTokenizerZMBartTokenizerZMBart50TokenizerZMPNetTokenizerZMobileBertTokenizerZMvpTokenizerZNllbTokenizerZOpenAIGPTTokenizerZPegasusTokenizerZQwen2TokenizerZRealmTokenizerZReformerTokenizerZRemBertTokenizerZRetriBertTokenizerZRobertaTokenizerZRoFormerTokenizerZSeamlessM4TTokenizerZSqueezeBertTokenizerZT5TokenizerZUdopTokenizerZWhisperTokenizerZXLMRobertaTokenizerZXLNetTokenizer)ZSplinterTokenizerZXGLMTokenizerZLlamaTokenizerZCodeLlamaTokenizerZGemmaTokenizerZPhi3TokenizerFc                 C   sn   | j j}|tv r|st| }||  S ztd t| j| jd W S  t	y6   t
dtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r   r   zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r   r_   SLOW_TO_FAST_CONVERTERSrt   loggerinfor  r   r   r  r  rf   r   )Ztransformer_tokenizerZfrom_tiktokenZtokenizer_class_nameZconverter_classr   r   r   convert_slow_tokenizer  s&   

r  )r   )F)Prb   r   typingr   	packagingr   Z
tokenizersr   r   r   r   r   r	   r
   Ztokenizers.modelsr   r   r   utilsr   r   r   r   Zutils.import_utilsr   Z
get_loggerr_   r  r    boolrc   r*   rM   rN   rg   rp   rq   rv   r   r   r   r   r   r   r   r   r   r   r   r+  rC  rK  rV  rX  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   sR  $


'2''(.' %!5% ($+'4.&)ZQ	
 !"#$%&'()*+,-./01234=