"""Auto Tokenizer class."""

import importlib
import json
import os
import warnings
from collections import OrderedDict
from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union

from ...configuration_utils import PretrainedConfig
from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code
from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE
from ...utils import (
    cached_file,
    extract_commit_hash,
    is_g2p_en_available,
    is_sentencepiece_available,
    is_tokenizers_available,
    logging,
)
from ..encoder_decoder import EncoderDecoderConfig
from .auto_factory import _LazyAutoMapping
from .configuration_auto import (
    CONFIG_MAPPING_NAMES,
    AutoConfig,
    config_class_to_model_type,
    model_type_to_module_name,
    replace_list_option_in_docstrings,
)


if is_tokenizers_available():
    from ...tokenization_utils_fast import PreTrainedTokenizerFast
else:
    PreTrainedTokenizerFast = None


logger = logging.get_logger(__name__)

if TYPE_CHECKING:
    # This significantly improves completion suggestion performance when
    # the transformers package is used with Microsoft's Pylance language server.
    TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict()
else:
    TOKENIZER_MAPPING_NAMES = OrderedDict(
        [
            ("albert", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("aria", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
            ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bart", ("BartTokenizer", "BartTokenizerFast")),
            ("barthez", ("BarthezTokenizer" if is_sentencepiece_available() else None, "BarthezTokenizerFast" if is_tokenizers_available() else None)),
            ("bartpho", ("BartphoTokenizer", None)),
            ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)),
            ("bert-japanese", ("BertJapaneseTokenizer", None)),
            ("bertweet", ("BertweetTokenizer", None)),
            ("big_bird", ("BigBirdTokenizer" if is_sentencepiece_available() else None, "BigBirdTokenizerFast" if is_tokenizers_available() else None)),
            ("bigbird_pegasus", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("biogpt", ("BioGptTokenizer", None)),
            ("bitnet", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
            ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
            ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)),
            ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("byt5", ("ByT5Tokenizer", None)),
            ("camembert", ("CamembertTokenizer" if is_sentencepiece_available() else None, "CamembertTokenizerFast" if is_tokenizers_available() else None)),
            ("canine", ("CanineTokenizer", None)),
            ("chameleon", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("clap", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("clip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("clipseg", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("clvp", ("ClvpTokenizer", None)),
            ("code_llama", ("CodeLlamaTokenizer" if is_sentencepiece_available() else None, "CodeLlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
            ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)),
            ("colpali", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)),
            ("cpm", ("CpmTokenizer" if is_sentencepiece_available() else None, "CpmTokenizerFast" if is_tokenizers_available() else None)),
            ("cpmant", ("CpmAntTokenizer", None)),
            ("ctrl", ("CTRLTokenizer", None)),
            ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)),
            ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("dbrx", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)),
            ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, "DebertaV2TokenizerFast" if is_tokenizers_available() else None)),
            ("deepseek_v3", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("diffllama", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)),
            ("dpr", ("DPRQuestionEncoderTokenizer", "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None)),
            ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)),
            ("emu3", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)),
            ("esm", ("EsmTokenizer", None)),
            ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("falcon_mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("fastspeech2_conformer", ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None)),
            ("flaubert", ("FlaubertTokenizer", None)),
            ("fnet", ("FNetTokenizer" if is_sentencepiece_available() else None, "FNetTokenizerFast" if is_tokenizers_available() else None)),
            ("fsmt", ("FSMTTokenizer", None)),
            ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma2", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma3", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("gemma3_text", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)),
            ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)),
            ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)),
            ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("helium", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)),
            ("hubert", ("Wav2Vec2CTCTokenizer", None)),
            ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics2", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("idefics3", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("internvl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("jamba", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("janus", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("jetmoe", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("jukebox", ("JukeboxTokenizer", None)),
            ("kosmos-2", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
            ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)),
            ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)),
            ("llama", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llama4", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llama4_text", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_next", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_next_video", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("llava_onevision", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
            ("longt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("luke", ("LukeTokenizer", None)),
            ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
            ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)),
            ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
            ("mbart", ("MBartTokenizer" if is_sentencepiece_available() else None, "MBartTokenizerFast" if is_tokenizers_available() else None)),
            ("mbart50", ("MBart50Tokenizer" if is_sentencepiece_available() else None, "MBart50TokenizerFast" if is_tokenizers_available() else None)),
            ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("mgp-str", ("MgpstrTokenizer", None)),
            ("mistral", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mixtral", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mllama", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
            ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
            ("modernbert", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
            ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("mt5", ("MT5Tokenizer" if is_sentencepiece_available() else None, "MT5TokenizerFast" if is_tokenizers_available() else None)),
            ("musicgen", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("musicgen_melody", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)),
            ("myt5", ("MyT5Tokenizer", None)),
            ("nemotron", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nllb-moe", ("NllbTokenizer" if is_sentencepiece_available() else None, "NllbTokenizerFast" if is_tokenizers_available() else None)),
            ("nystromformer", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("omdet-turbo", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)),
            ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("paligemma", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("pegasus_x", ("PegasusTokenizer" if is_sentencepiece_available() else None, "PegasusTokenizerFast" if is_tokenizers_available() else None)),
            ("perceiver", ("PerceiverTokenizer", None)),
            ("persimmon", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)),
            ("phi3", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phimoe", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("phobert", ("PhobertTokenizer", None)),
            ("pix2struct", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)),
            ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)),
            ("prophetnet", ("ProphetNetTokenizer", None)),
            ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_5_omni", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_5_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_audio", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen3", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("qwen3_moe", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)),
            ("rag", ("RagTokenizer", None)),
            ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)),
            ("recurrent_gemma", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("reformer", ("ReformerTokenizer" if is_sentencepiece_available() else None, "ReformerTokenizerFast" if is_tokenizers_available() else None)),
            ("rembert", ("RemBertTokenizer" if is_sentencepiece_available() else None, "RemBertTokenizerFast" if is_tokenizers_available() else None)),
            ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roberta-prelayernorm", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("roc_bert", ("RoCBertTokenizer", None)),
            ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)),
            ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("seamless_m4t", ("SeamlessM4TTokenizer" if is_sentencepiece_available() else None, "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None)),
            ("seamless_m4t_v2", ("SeamlessM4TTokenizer" if is_sentencepiece_available() else None, "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None)),
            ("shieldgemma2", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("siglip", ("SiglipTokenizer" if is_sentencepiece_available() else None, None)),
            ("siglip2", ("GemmaTokenizer" if is_sentencepiece_available() else None, "GemmaTokenizerFast" if is_tokenizers_available() else None)),
            ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
            ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
            ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
            ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")),
            ("squeezebert", ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None)),
            ("stablelm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
            ("starcoder2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)),
            ("switch_transformers", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("t5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("tapas", ("TapasTokenizer", None)),
            ("tapex", ("TapexTokenizer", None)),
            ("transfo-xl", ("TransfoXLTokenizer", None)),
            ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("udop", ("UdopTokenizer" if is_sentencepiece_available() else None, "UdopTokenizerFast" if is_tokenizers_available() else None)),
            ("umt5", ("T5Tokenizer" if is_sentencepiece_available() else None, "T5TokenizerFast" if is_tokenizers_available() else None)),
            ("video_llava", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vipllava", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
            ("vits", ("VitsTokenizer", None)),
            ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)),
            ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)),
            ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)),
            ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)),
            ("xglm", ("XGLMTokenizer" if is_sentencepiece_available() else None, "XGLMTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm", ("XLMTokenizer", None)),
            ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)),
            ("xlm-roberta", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlm-roberta-xl", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("xlnet", ("XLNetTokenizer" if is_sentencepiece_available() else None, "XLNetTokenizerFast" if is_tokenizers_available() else None)),
            ("xmod", ("XLMRobertaTokenizer" if is_sentencepiece_available() else None, "XLMRobertaTokenizerFast" if is_tokenizers_available() else None)),
            ("yoso", ("AlbertTokenizer" if is_sentencepiece_available() else None, "AlbertTokenizerFast" if is_tokenizers_available() else None)),
            ("zamba", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
            ("zamba2", ("LlamaTokenizer" if is_sentencepiece_available() else None, "LlamaTokenizerFast" if is_tokenizers_available() else None)),
        ]
    )

TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES)

CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()}


def tokenizer_class_from_name(class_name: str):
    if class_name == "PreTrainedTokenizerFast":
        return PreTrainedTokenizerFast

    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
        if class_name in tokenizers:
            module_name = model_type_to_module_name(module_name)
            module = importlib.import_module(f".{module_name}", "transformers.models")
            try:
                return getattr(module, class_name)
            except AttributeError:
                continue

    for config, tokenizers in TOKENIZER_MAPPING._extra_content.items():
        for tokenizer in tokenizers:
            if getattr(tokenizer, "__name__", None) == class_name:
                return tokenizer

    # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the
    # main init and we return the proper dummy to get an appropriate error message.
    main_module = importlib.import_module("transformers")
    if hasattr(main_module, class_name):
        return getattr(main_module, class_name)

    return None


def get_tokenizer_config(
    pretrained_model_name_or_path: Union[str, os.PathLike],
    cache_dir: Optional[Union[str, os.PathLike]] = None,
    force_download: bool = False,
    resume_download: Optional[bool] = None,
    proxies: Optional[Dict[str, str]] = None,
    token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    **kwargs,
):
    """
    Loads the tokenizer configuration from a pretrained model tokenizer configuration.

    Args:
        pretrained_model_name_or_path (`str` or `os.PathLike`):
            This can be either:

            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
              huggingface.co.
            - a path to a *directory* containing a configuration file saved using the
              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.

        cache_dir (`str` or `os.PathLike`, *optional*):
            Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
            cache should not be used.
        force_download (`bool`, *optional*, defaults to `False`):
            Whether or not to force to (re-)download the configuration files and override the cached versions if they
            exist.
        resume_download:
            Deprecated and ignored. All downloads are now resumed by default when possible.
            Will be removed in v5 of Transformers.
        proxies (`Dict[str, str]`, *optional*):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        token (`str` or *bool*, *optional*):
            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
            when running `huggingface-cli login` (stored in `~/.huggingface`).
        revision (`str`, *optional*, defaults to `"main"`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
            identifier allowed by git.
        local_files_only (`bool`, *optional*, defaults to `False`):
            If `True`, will only try to load the tokenizer configuration from local files.
        subfolder (`str`, *optional*, defaults to `""`):
            In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can
            specify the folder name here.

    <Tip>

    Passing `token=True` is required when you want to use a private model.

    </Tip>

    Returns:
        `Dict`: The configuration of the tokenizer.

    Examples:

    ```python
    # Download configuration from huggingface.co and cache.
    tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased")
    # This model does not have a tokenizer config so the result will be an empty dict.
    tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base")

    # Save a pretrained tokenizer locally and you can reload its config
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    ```"""
    use_auth_token = kwargs.pop("use_auth_token", None)
    if use_auth_token is not None:
        warnings.warn(
            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. "
            "Please use `token` instead.",
            FutureWarning,
        )
        if token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        token = use_auth_token

    commit_hash = kwargs.get("_commit_hash", None)
    resolved_config_file = cached_file(
        pretrained_model_name_or_path,
        TOKENIZER_CONFIG_FILE,
        cache_dir=cache_dir,
        force_download=force_download,
        resume_download=resume_download,
        proxies=proxies,
        token=token,
        revision=revision,
        local_files_only=local_files_only,
        subfolder=subfolder,
        _raise_exceptions_for_gated_repo=False,
        _raise_exceptions_for_missing_entries=False,
        _raise_exceptions_for_connection_errors=False,
        _commit_hash=commit_hash,
    )
    if resolved_config_file is None:
        logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")
        return {}
    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)

    with open(resolved_config_file, encoding="utf-8") as reader:
        result = json.load(reader)
    result["_commit_hash"] = commit_hash
    return result


class AutoTokenizer:
    r"""
    This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
    created with the [`AutoTokenizer.from_pretrained`] class method.

    This class cannot be instantiated directly using `__init__()` (throws an error).
    """

    def __init__(self):
        raise EnvironmentError(
            "AutoTokenizer is designed to be instantiated "
            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method."
        )

    @classmethod
    @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES)
    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
        r"""
        Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.

        The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either
        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by
        falling back to using pattern matching on `pretrained_model_name_or_path`:

        List options

        Params:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
                    - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                      applicable to all derived classes)
            inputs (additional positional arguments, *optional*):
                Will be passed along to the Tokenizer `__init__()` method.
            config ([`PretrainedConfig`], *optional*):
                The configuration object used to determine the tokenizer class to instantiate.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded pretrained model configuration should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download of the model weights and configuration files and override
                the cached versions if they exist.
            resume_download:
                Deprecated and ignored. All downloads are now resumed by default when possible.
                Will be removed in v5 of Transformers.
            proxies (`Dict[str, str]`, *optional*):
                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            subfolder (`str`, *optional*):
                In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                facebook/rag-token-base), specify it here.
            use_fast (`bool`, *optional*, defaults to `True`):
                Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for
                a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer
                is returned instead.
            tokenizer_type (`str`, *optional*):
                Tokenizer type to be loaded.
            trust_remote_code (`bool`, *optional*, defaults to `False`):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
                `additional_special_tokens`. See parameters in the `__init__()` for more details.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer

        >>> # Download vocabulary from huggingface.co and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")

        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
        >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")

        >>> # Download vocabulary from huggingface.co and define model-specific arguments
        >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True)
        ```"""
        use_auth_token = kwargs.pop("use_auth_token", None)
        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. "
                "Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        config = kwargs.pop("config", None)
        kwargs["_from_auto"] = True

        use_fast = kwargs.pop("use_fast", True)
        tokenizer_type = kwargs.pop("tokenizer_type", None)
        trust_remote_code = kwargs.pop("trust_remote_code", None)
        gguf_file = kwargs.get("gguf_file", None)

        # First, let's see whether the tokenizer_type is passed so that we can leverage it
        if tokenizer_type is not None:
            tokenizer_class = None
            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)

            if tokenizer_class_tuple is None:
                raise ValueError(
                    f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                    f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}."
                )

            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple

            if use_fast:
                if tokenizer_fast_class_name is not None:
                    tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
                else:
                    logger.warning(
                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
                        " Falling back to the slow version."
                    )
            if tokenizer_class is None:
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)

            if tokenizer_class is None:
                raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")

            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Next, let's try to use the tokenizer_config file to get the tokenizer class.
        tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if "_commit_hash" in tokenizer_config:
            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
        tokenizer_auto_map = None
        if "auto_map" in tokenizer_config:
            if isinstance(tokenizer_config["auto_map"], (tuple, list)):
                # Legacy format for dynamic tokenizers
                tokenizer_auto_map = tokenizer_config["auto_map"]
            else:
                tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)

        # If that did not work, let's try to use the config.
        if config_tokenizer_class is None:
            if not isinstance(config, PretrainedConfig):
                if gguf_file:
                    gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
                    config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
                    config = AutoConfig.for_model(**config_dict)
                else:
                    config = AutoConfig.from_pretrained(
                        pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                    )
            config_tokenizer_class = config.tokenizer_class
            if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                tokenizer_auto_map = config.auto_map["AutoTokenizer"]

        has_remote_code = tokenizer_auto_map is not None
        has_local_code = type(config) in TOKENIZER_MAPPING or (
            config_tokenizer_class is not None
            and (
                tokenizer_class_from_name(config_tokenizer_class) is not None
                or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None
            )
        )
        trust_remote_code = resolve_trust_remote_code(
            trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code
        )

        if has_remote_code and trust_remote_code:
            if use_fast and tokenizer_auto_map[1] is not None:
                class_ref = tokenizer_auto_map[1]
            else:
                class_ref = tokenizer_auto_map[0]
            tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs)
            _ = kwargs.pop("code_revision", None)
            if os.path.isdir(pretrained_model_name_or_path):
                tokenizer_class.register_for_auto_class()
            return tokenizer_class.from_pretrained(
                pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs
            )
        elif config_tokenizer_class is not None:
            tokenizer_class = None
            if use_fast and not config_tokenizer_class.endswith("Fast"):
                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                tokenizer_class_candidate = config_tokenizer_class
                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
            if tokenizer_class is None:
                raise ValueError(
                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
                )
            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

        # Otherwise we have to be creative.
        # if model is an encoder decoder, the encoder tokenizer class is used by default
        if isinstance(config, EncoderDecoderConfig):
            if type(config.decoder) is not type(config.encoder):  # noqa: E721
                logger.warning(
                    f"The encoder model config class: {config.encoder.__class__} is different from the decoder model "
                    f"config class: {config.decoder.__class__}. It is not recommended to use the "
                    "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder "
                    "specific tokenizer classes."
                )
            config = config.encoder

        model_type = config_class_to_model_type(type(config).__name__)
        if model_type is not None:
            tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]

            if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
                return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
            else:
                if tokenizer_class_py is not None:
                    return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
                else:
                    raise ValueError(
                        "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
                        "in order to use this tokenizer."
                    )

        raise ValueError(
            f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"
            f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}."
        )

    @staticmethod
    def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False):
        """
        Register a new tokenizer in this mapping.

        Args:
            config_class ([`PretrainedConfig`]):
                The configuration corresponding to the model to register.
            slow_tokenizer_class ([`PretrainedTokenizer`], *optional*):
                The slow tokenizer to register.
            fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                The fast tokenizer to register.
        """
        if slow_tokenizer_class is None and fast_tokenizer_class is None:
            raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class`.")
        if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast):
            raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.")
        if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer):
            raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.")

        if (
            slow_tokenizer_class is not None
            and fast_tokenizer_class is not None
            and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast)
            and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class
        ):
            raise ValueError(
                "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not "
                "consistent with the slow tokenizer class you passed (fast tokenizer has "
                f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}). "
                "Fix one of those so they match!"
            )

        # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones.
        if config_class in TOKENIZER_MAPPING._extra_content:
            existing_slow, existing_fast = TOKENIZER_MAPPING[config_class]
            if slow_tokenizer_class is None:
                slow_tokenizer_class = existing_slow
            if fast_tokenizer_class is None:
                fast_tokenizer_class = existing_fast

        TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok)


__all__ = ["AutoTokenizer", "TOKENIZER_MAPPING"]
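
if __name__ == "__main__":
    # Illustrative usage sketch (an editorial addition, not part of the original module):
    # it exercises the public entry points defined above, mirroring the docstring
    # examples. The Hub checkpoint needs network access or a warm local cache, and
    # `NewConfig`/`NewTokenizer` are hypothetical stand-ins for user-defined classes.

    # 1) Resolve the concrete tokenizer class from the checkpoint's metadata.
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
    print(type(tokenizer).__name__)  # e.g. BertTokenizerFast when `tokenizers` is installed

    # 2) Round-trip the tokenizer configuration through save_pretrained/get_tokenizer_config.
    tokenizer.save_pretrained("tokenizer-test")
    tokenizer_config = get_tokenizer_config("tokenizer-test")
    print(tokenizer_config.get("tokenizer_class"))  # e.g. "BertTokenizer"

    # 3) Register a (hypothetical) custom config/tokenizer pair so that
    #    AutoTokenizer can resolve it later via the lazy mapping.
    class NewConfig(PretrainedConfig):
        model_type = "new-model"

    class NewTokenizer(PreTrainedTokenizer):
        pass

    AutoTokenizer.register(NewConfig, slow_tokenizer_class=NewTokenizer)
    assert TOKENIZER_MAPPING[NewConfig] == (NewTokenizer, None)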