o
    Zh                     @   s`   d dl mZmZmZmZmZ d dlmZ d dlm	Z	 d dl
mZ G dd dZdefdd	Zd
S )    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                       sN   e Zd ZdZ				d fdd	Zdefdd	Zd
d ZdefddZ	  Z
S )MistralConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    s(   t  j|  || _|| _|| _|| _d S )N)super__init__vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   argskwargs	__class__ P/var/www/auris/lib/python3.10/site-packages/transformers/integrations/mistral.pyr      s
   	
zMistralConverter.__init__r   c           
         s  | t  fddg }i }t  D ]\\}\}}|| jvrm|||< t|dkr-qg }tdt|D ]%}|d | ||d  }}	| v r[|	 v r[||	  v r[|||	|f q6t| fdddd}|| q|||< qt|dd dd}fd	d
|D }||fS )Nc                    s   d  fdd| dD S )N c                    s   g | ]} t | qS r   )ord).0charbyte_encoderr   r   
<listcomp>!       zcMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)joindecode)br   r   r   token_bytes_to_string    s   zOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string   c                    s    | d   | d  fS )Nr   r%   r   )x)	bpe_ranksr   r   <lambda>/   r    zBMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>F)keyreversec                 S   s   | d S )N   r   )valr   r   r   r(   3   s    c                    s$   g | ]} |d   |d fqS )r   r%   r   )r   r,   )r$   r   r   r   4   s   $ zDMistralConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)	r	   	enumerateitemsr   lenrangeappendsortedextend)
r   r   mergesidxtokenZranklocalindexZpiece_lZpiece_rr   )r'   r   r$   r   extract_vocab_merges_from_model   s,   

z0MistralConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)Zfuse_unkignore_mergesT)r9   r   r   r   hasattrmodelr:   )r   Zvocab_scoresr4   	tokenizerr   r   r   r=   7   s
   zMistralConverter.tokenizerreturnc                 C   s^   |   }ttjt| jdddtj| jddg|_t	 |_
|| j tjdd|_|S )NisolatedF)Zbehaviorinvert)r   	use_regex)Ztrim_offsets)r=   r   SequenceZSplitr   r   Z	ByteLevelr   Zpre_tokenizerr   decoderadd_special_tokensr   r   Zpost_processor)r   r=   r   r   r   	converted>   s   
zMistralConverter.converted)Nr   FN)__name__
__module____qualname____doc__r   strr9   r=   r   rE   __classcell__r   r   r   r   r
      s    r
   tokenizer_filec                    s|   ddl m} || }|jjj}dd |jjjD   fdd D }|| |}tt	| d
 d}|d	 i |S )
z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizerc                 S   s    g | ]}t |d r|jn|qS )value)r;   rN   r   r6   r   r   r   r   Y   s    z,convert_tekken_tokenizer.<locals>.<listcomp>c                    s   i | ]}|  |qS r   )r8   rO   Zall_specialr   r   
<dictcomp>]   r    z,convert_tekken_tokenizer.<locals>.<dictcomp>)r   r   )Ztokenizer_objectr   )Z(mistral_common.tokens.tokenizers.mistralrM   	from_fileZinstruct_tokenizerr=   Z_tekken_token2id_nospecialZ_all_special_tokensupdater   r
   rE   rD   )rL   rM   Zmistral_tokenizerr   Zspecials_tokensr=   r   rP   r   convert_tekken_tokenizerN   s   


rT   N)Z
tokenizersr   r   r   r   r   Ztokenizers.modelsr   Ztransformersr   Z#transformers.convert_slow_tokenizerr	   r
   rJ   rT   r   r   r   r   <module>   s    F