
    eTh                     f    S SK JrJrJrJrJr  S SKJr  S SKJ	r	  S SK
Jr   " S S5      rS\4S jrg	)
    )Regex	Tokenizerdecoderspre_tokenizers
processors)BPE)LlamaTokenizerFast)bytes_to_unicodec                   Z   ^  \ rS rSrSr    S
U 4S jjrS\4S jrS rS\	4S jr
S	rU =r$ )MistralConverter   z
A general tiktoken converter.
c                 P   > [         TU ]  " U6   Xl        X l        X0l        X@l        g )N)super__init__vocabpatternadd_prefix_spaceadditional_special_tokens)selfr   r   r   r   argskwargs	__class__s          Y/var/www/auris/envauris/lib/python3.13/site-packages/transformers/integrations/mistral.pyr   MistralConverter.__init__   s(     	$
 0)B&    r   c                 4  ^^ Um[        5       mU4S jn/ n0 n[        TR                  5       5       H  u  nu  pVXPR                  ;  a  XAU" U5      '   [	        U5      S:X  a  M2  / n[        S[	        U5      5       H8  nUS U XXS  pU	T;   d  M  U
T;   d  M  X-   T;   d  M%  UR                  XU45        M:     [        UU4S jSS9nUR                  U5        M  XAU'   M     [        US SS9nU Vs/ s H  o" US   5      U" US   5      4PM     nnX4$ s  snf )Nc           	         > SR                  U R                  S5       Vs/ s H  nT[        U5         PM     sn5      $ s  snf )N zlatin-1)joindecodeord)bcharbyte_encoders     r   token_bytes_to_stringOMistralConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string    s8    77@ST@SLT3@STUUTs   ?   c                 $   > TU S      TU S      4$ )Nr   r'    )x	bpe_rankss    r   <lambda>BMistralConverter.extract_vocab_merges_from_model.<locals>.<lambda>/   s    Yqt_iPQRSPTo4Vr   F)keyreversec                     U S   $ )N   r)   )vals    r   r,   r-   3   s    Ar   r   )	r
   	enumerateitemsr   lenrangeappendsortedextend)r   r   r%   mergesidxtokenranklocalindexpiece_lpiece_rr2   r+   r$   s               @@r   extract_vocab_merges_from_model0MistralConverter.extract_vocab_merges_from_model   s1   	')	V "+IOO,=">C%:::69+E23u:?"1c%j1E',Ve}eFmW)+90D'J[`iIig%=> 2 u*V`efe$"e #? $6F\bc\bUX(Q02GA2OP\bc} ds   1Dc                     U R                  U R                  5      u  p[        [        XSS95      n[	        UR
                  S5      (       a  SUR
                  l        U$ )NF)fuse_unkignore_mergesT)rB   r   r   r   hasattrmodelrF   )r   vocab_scoresr:   	tokenizers       r   rJ   MistralConverter.tokenizer7   sM    #CCDJJOc,GH	9??O44,0IOO)r   returnc                    U R                  5       n[        R                  " [        R                  " [	        U R
                  5      SSS9[        R                  " U R                  SS9/5      Ul        [        R                  " 5       Ul
        UR                  U R                  5        [        R                  " SS9Ul        U$ )NisolatedF)behaviorinvert)r   	use_regex)trim_offsets)rJ   r   SequenceSplitr   r   	ByteLevelr   pre_tokenizerr   decoderadd_special_tokensr   r   post_processor)r   rJ   s     r   	convertedMistralConverter.converted>   s    NN$	"0"9"9$$U4<<%8:V[\(($:O:O[`a#
	 %..0	$$T%C%CD#-#7#7U#K	 r   )r   r   r   r   )Nzs(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+FN)__name__
__module____qualname____firstlineno____doc__r   strrB   rJ   r   rZ   __static_attributes____classcell__)r   s   @r   r   r      sA      K"&CS 69  r   r   tokenizer_filec                    SSK Jn  UR                  U 5      nUR                  R                  R
                  nUR                  R                  R                   Vs/ s H"  n[        US5      (       a  UR                  OUPM$     nnU Vs0 s H  oDUR                  U5      _M     nnUR                  U5        Un[        [        X5S9R                  5       S9nUR                  SU05        U$ s  snf s  snf )z1Convert a "tekken" tokenizer to a fast Tokenizer.r   )MistralTokenizervalue)r   r   )tokenizer_objectr   )(mistral_common.tokens.tokenizers.mistralrf   	from_fileinstruct_tokenizerrJ   _tekken_token2id_nospecial_all_special_tokensrG   rg   r?   updater	   r   rZ   rX   )rd   rf   mistral_tokenizerr   r<   all_specialspecials_tokensrJ   s           r   convert_tekken_tokenizerrr   N   s     J )22>B 00::UUE '99CCWWWE ug..E9W   EPPK5k//66KOP5!E #)]ggiI
   "={!KL! Qs   )C,	C1N)
tokenizersr   r   r   r   r   tokenizers.modelsr   transformersr	   #transformers.convert_slow_tokenizerr
   r   ra   rr   r)   r   r   <module>rw      s-    M M ! + @C CLS r   