"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Optional, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers keep their added tokens in an additional file
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class: Optional[PreTrainedTokenizer] = None

    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        gguf_file = kwargs.pop("gguf_file", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)
        added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
        self.add_prefix_space = kwargs.get("add_prefix_space", False)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = copy.deepcopy(tokenizer_object)
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from the tokenizers library, build the backend directly from it
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif gguf_file is not None:
            # Build the backend from the tokenizer stored inside a GGUF checkpoint
            gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file"))
            architecture = gguf_param["config"]["model_type"]
            tokenizer_dict = gguf_param["tokenizer"]
            tokenizer_config = gguf_param["tokenizer_config"]
            fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict)
            kwargs.update(tokenizer_config)
            if len(additional_kwargs) > 0:
                kwargs.update(additional_kwargs)
        elif self.slow_tokenizer_class is not None and slow_tokenizer is not False:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif not slow_tokenizer:
            # We try to load with tiktoken
            self.vocab_file = kwargs.get("vocab_file", None)
            self.additional_special_tokens = kwargs.get("additional_special_tokens", [])
            fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True)
            slow_tokenizer = None
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        # Mirror the backend truncation/padding configuration into the kwargs seen by the base class
        _truncation = self._tokenizer.truncation

        if _truncation is not None:
            self._tokenizer.enable_truncation(**_truncation)
            kwargs.setdefault("max_length", _truncation["max_length"])
            kwargs.setdefault("truncation_side", _truncation["direction"])
            kwargs.setdefault("stride", _truncation["stride"])
            kwargs.setdefault("truncation_strategy", _truncation["strategy"])
        else:
            self._tokenizer.no_truncation()

        _padding = self._tokenizer.padding
        if _padding is not None:
            self._tokenizer.enable_padding(**_padding)
            kwargs.setdefault("pad_token", _padding["pad_token"])
            kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"])
            kwargs.setdefault("padding_side", _padding["direction"])
            kwargs.setdefault("max_length", _padding["length"])
            kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"])

        # We call this after having initialized the backend tokenizer because we update it.
        super().__init__(**kwargs)
        self._tokenizer.encode_special_tokens = self.split_special_tokens

        added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder}
        tokens_to_add = [
            token
            for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0])
            if hash(repr(token)) not in added_tokens_decoder_hash
        ]
        encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add]
        # if some of the special tokens are strings, we check if we don't already have a token
        tokens_to_add += [
            token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add
        ]

        if len(tokens_to_add) > 0:
            tokens = []
            special_tokens = self.all_special_tokens
            for token in tokens_to_add:
                is_special = (
                    (token.special or str(token) in special_tokens)
                    if isinstance(token, AddedToken)
                    else str(token) in special_tokens
                )
                if isinstance(token, str):
                    token = AddedToken(token, special=is_special)
                else:
                    token.special = is_special
                tokens.append(token)
            if tokens:
                self.add_tokens(tokens)

        try:
            pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
            if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space:
                pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type"))
                pre_tok_state["add_prefix_space"] = self.add_prefix_space
                self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)
        except Exception:
            # Not every pre-tokenizer exposes a JSON state; in that case leave it untouched.
            pass

    @property
    def is_fast(self) -> bool:
        return True

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """
        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        """
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)

            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `List[int]`: The token id or list of token ids.
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[Union[str, AddedToken]], special_tokens: bool = False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
        for index in ids:
            index = int(index)
            if index in ids_to_skip:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding
        # Set truncation and padding on the backend tokenizer
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }

            # `_truncation` might contain more keys than the ones set here; compare only the target keys so the
            # check stays stable across `tokenizers` versions.
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}

            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy and restore the initial configuration
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each low-level encoding (and its overflows) into a python dict plus the raw encodings
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Convert the output from a list of dicts to a dict of lists and flatten the overflow dimension:
        # from (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # If returning overflowing tokens, we need to return a mapping from the batch idx to the original sample
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

        # If return_tensors is None, remove the leading batch axis, except when overflowing tokens are returned
        # (in that case the overflow dimension is kept).
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        if self.backend_tokenizer.decoder is not None:
            return self.backend_tokenizer.decoder.decode(tokens)
        return " ".join(tokens)

    def _decode(
        self,
        token_ids: Union[int, list[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        else:
            return text

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # Only store tokens that were added on top of the base vocabulary
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `List[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`Dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        """
        tokenizer_json = json.loads(self._tokenizer.to_str())
        # Remove added tokens for now (uses IDs of tokens)
        added_tokens = tokenizer_json.pop("added_tokens")
        # Remove post processor for now (uses IDs of tokens)
        post_processor = tokenizer_json.pop("post_processor")

        unk_token = None
        # Remove vocab
        if tokenizer_json["model"]["type"] == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif tokenizer_json["model"]["type"] == "Unigram":
            if tokenizer_json["model"]["unk_id"] is not None:
                unk_id = tokenizer_json["model"]["unk_id"]
                unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
                if special_tokens_map is not None and unk_token in special_tokens_map:
                    unk_token = special_tokens_map[unk_token]
                tokenizer_json["model"]["unk_id"] = 0
                tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )

        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]

        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Get the special tokens from the current tokenizer if none are specified.
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            _ = added_token.pop("id", None)
            if tokenizer_json["model"]["type"] != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))

        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        # The trainer needs to know the end-of-word / continuing-subword prefixes used by BPE
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "continuing_subword_prefix" not in kwargs
            and tokenizer_json["model"]["continuing_subword_prefix"] is not None
        ):
            kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"]
        if (
            tokenizer_json["model"]["type"] == "BPE"
            and "end_of_word_suffix" not in kwargs
            and tokenizer_json["model"]["end_of_word_suffix"] is not None
        ):
            kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"]
        if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None and (
            tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
            or (
                tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and "pretokenizers" in tokenizer_json["pre_tokenizer"]
                and any(
                    pretokenizer["type"] == "ByteLevel"
                    for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"]
                )
            )
        ):
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            # Almost done, we just have to adjust the token IDs in the post processor
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    for token in tokens:
                        token_id = tokenizer.token_to_id(token)
                        if token_id is None:
                            raise ValueError(
                                "Attempted to set a token in the post processor that does not exist in the mapping"
                            )
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]

            for special_token in ["cls", "sep"]:
                if special_token in post_processor:
                    token, _ = post_processor[special_token]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    token_id = tokenizer.token_to_id(token)
                    if token_id is None:
                        raise ValueError(
                            "Attempted to set a token in the post processor that does not exist in the mapping"
                        )
                    post_processor[special_token] = [token, token_id]

            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        kwargs = self.init_kwargs.copy()
        # Map the special tokens at the Transformers level (pad/cls/mask/... attributes)
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token in special_tokens_list:
            if getattr(self, token) is not None:
                special_token = getattr(self, token)
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]

                special_token_full = self._special_tokens_map.get(token, None)
                if isinstance(special_token_full, AddedToken):
                    # Create an added token with the same parameters except the content
                    kwargs[token] = AddedToken(
                        special_token,
                        single_word=special_token_full.single_word,
                        lstrip=special_token_full.lstrip,
                        rstrip=special_token_full.rstrip,
                        normalized=special_token_full.normalized,
                        special=True,
                    )
                else:
                    kwargs[token] = special_token

        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
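
# Minimal usage sketch: build a toy in-memory WordLevel backend, wrap it with `PreTrainedTokenizerFast`,
# and encode a padded batch. The tiny vocabulary and the choice of special tokens below are illustrative
# only; real tokenizers are normally loaded with `PreTrainedTokenizerFast.from_pretrained(...)` or
# `AutoTokenizer`, and `tokenizer_file="tokenizer.json"` can be passed instead of `tokenizer_object`.
if __name__ == "__main__":
    from tokenizers import models

    toy_vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
    backend = TokenizerFast(models.WordLevel(toy_vocab, unk_token="[UNK]"))
    backend.pre_tokenizer = pre_tokenizers_fast.Whitespace()

    toy_tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend, unk_token="[UNK]", pad_token="[PAD]")

    # `__call__` comes from `PreTrainedTokenizerBase`; padding to the longest sequence in the batch goes
    # through `set_truncation_and_padding` and the backend's `enable_padding`.
    batch = toy_tokenizer(["hello world", "hello"], padding=True)
    print(batch["input_ids"])  # expected with the toy vocab: [[2, 3], [2, 1]]
    print(toy_tokenizer.convert_ids_to_tokens(batch["input_ids"][0]))  # ['hello', 'world']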


