
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Optional, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers have an additional added tokens file
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: PreTrainedTokenizer = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        gguf_file = kwargs.pop("gguf_file", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        self.add_prefix_space = kwargs.get("add_prefix_space", False)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from tokenizers which let us directly build the backend
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif gguf_file is not None:
            # We need to convert a slow tokenizer to build the backend
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            tokenizer_config = gguf_param["tokenizer_config"]
            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
            kwargs.update(tokenizer_config)
            if len(additional_kwargs) > 0:
                kwargs.update(additional_kwargs)
        elif self.slow_tokenizer_class is not None and slow_tokenizer is not False:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif not slow_tokenizer:
            # We try to load with tiktoken
            self.vocab_file = kwargs.get("vocab_file", None)
            self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
            fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
            slow_tokenizer = None
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        _truncation = self._tokenizer.truncation
        if _truncation is not None:
            self._tokenizer.enable_truncation(**_truncation)
            kwargs.setdefault("max_length", _truncation["max_length"])
            kwargs.setdefault("truncation_side", _truncation["direction"])
            kwargs.setdefault("stride", _truncation["stride"])
            kwargs.setdefault("truncation_strategy", _truncation["strategy"])
        else:
            self._tokenizer.no_truncation()

        _padding = self._tokenizer.padding
        if _padding is not None:
            self._tokenizer.enable_padding(**_padding)
            kwargs.setdefault("pad_token", _padding["pad_token"])
            kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
            kwargs.setdefault("padding_side", _padding["direction"])
            kwargs.setdefault("max_length", _padding["length"])
            kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

        # We call this after having initialized the backend tokenizer because we update it
        super().__init__(**kwargs)
        self._tokenizer.encode_special_tokens = self.split_special_tokens

        added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
        tokens_to_add = [
            token
            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
            if hash(repr(token)) not in added_tokens_decoder_hash
        ]
        encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
        # If some of the special tokens are strings, check we don't already have a matching token
        tokens_to_add += [
            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
        ]

        if len(tokens_to_add) > 0:
            tokens = []
            special_tokens = self.all_special_tokens
            for token in tokens_to_add:
                is_special = (
                    (token.special or str(token) in special_tokens)
                    if isinstance(token, AddedToken)
                    else str(token) in special_tokens
                )
                if isinstance(token, str):
                    token = AddedToken(token, special=is_special)
                else:
                    token.special = is_special
                tokens.append(token)
            if tokens:
                self.add_tokens(tokens)

        try:
            pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
            if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
                pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type"))
                pre_tok_state["add_prefix_space"] = self.add_prefix_space
                self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        except Exception:
            # The pre_tokenizer state cannot be read, so leave it untouched
            pass

    @property
    def is_fast(self) -> bool:
        return True
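    # Illustrative instantiation sketch (kept as comments so the module stays importable); the
    # checkpoint name and file path below are assumptions, any fast tokenizer works the same way:
    #
    #   tok = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")      # from a serialized backend
    #   tok = PreTrainedTokenizerFast.from_pretrained("bert-base-uncased")  # or from a checkpoint
    #   assert tok.is_fast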
    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True
    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()
    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[int, AddedToken]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()
    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}
    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)
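    # Sketch of the two vocabulary sizes above (numbers and the added token are illustrative):
    #
    #   tok.vocab_size                    # base vocabulary only, e.g. 30522
    #   tok.add_tokens(["<new_token>"])
    #   len(tok)                          # base vocabulary + added tokens, e.g. 30523
    #   tok.get_added_vocab()             # {"<new_token>": 30522}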
    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer
    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings
    def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[Union[str, AddedToken]], special_tokens: bool = False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)
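    # Round-trip sketch for the conversion helpers above (the added token is a made-up example):
    #
    #   tok.add_tokens([AddedToken("<custom>", special=False)])
    #   idx = tok.convert_tokens_to_ids("<custom>")   # unknown tokens fall back to tok.unk_token_id
    #   tok.convert_ids_to_tokens(idx)                # -> "<custom>"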
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)
    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)

        tokens = []
        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
        for index in ids:
            index = int(index)
            if index in ids_to_skip:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()
    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation on the backend tokenizer only when the requested settings changed
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }
            # `_truncation` might contain more keys than the target supports; compare only those keys so this
            # works across `tokenizers` versions without triggering unnecessary updates.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        # Same logic for padding
        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy and restore the initial configuration afterwards
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each low-level encoding to a (dict, encodings) pair,
        # with nested dimensions corresponding to (batch, overflows, sequence length)
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Flatten from list[dict] to dict[list] and remove the overflows dimension:
        # (batch, overflows, sequence length) -> (batch * overflows, sequence length)
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # If returning overflowing tokens, also return a mapping from the flattened rows to the original samples
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)

        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

        # If return_tensors is None, unpack the batched dimension back to a single example
        # to keep backward compatibility
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return (
            self.backend_tokenizer.decoder.decode(tokens)
            if self.backend_tokenizer.decoder is not None
            else " ".join(tokens)
        )

    def _decode(
        self,
        token_ids: Union[int, list[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text
    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # Make sure to be forward compatible
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names
    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking.
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens and the post processor for now (both use IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove the vocab
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # The trainer needs to know the end of word / continuing subword prefixes for BPE
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None:
            if (
                tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
                or tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
                and any(
                    pretokenizer["type"] == "ByteLevel"
                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
                )
            ):
                kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    for token in tokens:
                        token_id = tokenizer.token_to_id(token)
                        if token_id is None:
                            raise ValueError(
                                "Attempted to set a token in the post processor that does not exist in the mapping"
                            )
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    if token_id is None:
                        raise ValueError(
                            "Attempted to set a token in the post processor that does not exist in the mapping"
                        )
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map pad/cls/mask token to the new special tokens at the Transformers level
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            if getattr(self, token) is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = self._special_tokens_map.get(token, None)
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                        special=True,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
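# Hedged retraining sketch for `train_new_from_iterator` (the checkpoint, corpus and vocab size are
# assumptions; `AutoTokenizer` would need to be imported by the caller):
#
#   old_tok = AutoTokenizer.from_pretrained("gpt2")
#   corpus = (batch_of_texts for batch_of_texts in my_text_batches)
#   new_tok = old_tok.train_new_from_iterator(corpus, vocab_size=32000)
#   new_tok.save_pretrained("./retrained-tokenizer")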