o
    Zh+                     @   s   d dl Z d dlmZ d dlmZmZmZ d dlmZ ddl	m
Z
mZ ddlmZ ddlmZmZ e r:d	d
lmZ ndZeeZdddZg dZG dd deZdgZdS )    N)copyfile)ListOptionalTuple)
processors   )
AddedTokenBatchEncoding)PreTrainedTokenizerFast)is_sentencepiece_availablelogging   )MBartTokenizerzsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)Zar_ARcs_CZde_DEen_XXZes_XXet_EEfi_FIZfr_XXgu_INhi_INit_ITZja_XXkk_KZko_KRlt_LTlv_LVZmy_MMne_NPZnl_XXro_ROru_RUsi_LKtr_TRvi_VNzh_CNc                       s  e Zd ZU dZeZddgZeZg Z	e
e ed< g Ze
e ed< 									
				d3 fdd	ZedefddZedefddZejdeddfddZ	d4de
e dee
e  de
e fddZ	d4de
e dee
e  de
e fddZdedee dee fddZ	 		!d5d"e
e ded#ee
e  dedef
 fd$d%Zd&d' Zd(d) Zd6d*d+Zd,eddfd-d.Zd4d/ed0ee dee fd1d2Z   Z!S )7MBartTokenizerFastuO  
    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on
    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).

    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizerFast

    >>> tokenizer = MBartTokenizerFast.from_pretrained(
    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
    ... )
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```Z	input_idsZattention_maskprefix_tokenssuffix_tokensN<s></s><unk><pad><mask>c                    s   t |	trt|	dddn|	}	t  |d ur"  fdd|D  t jd
|||||||||	|
| d| |_fddtD _	|
d urK|
nd	_
j
_|_j
 d S )NTF)lstriprstripc                    s   g | ]}| vr|qS  r.   ).0t)_additional_special_tokensr.   `/var/www/auris/lib/python3.10/site-packages/transformers/models/mbart/tokenization_mbart_fast.py
<listcomp>a   s    z/MBartTokenizerFast.__init__.<locals>.<listcomp>)r   r   	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_tokensrc_langtgt_langadditional_special_tokensc                    s   i | ]}|  |qS r.   )convert_tokens_to_ids)r/   Z	lang_codeselfr.   r2   
<dictcomp>u   s    z/MBartTokenizerFast.__init__.<locals>.<dictcomp>r   r.   )
isinstancestrr   FAIRSEQ_LANGUAGE_CODEScopyextendsuper__init__r   Zlang_code_to_id	_src_langr>   cur_lang_coder<   set_src_lang_special_tokens)r@   r   r   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   kwargs	__class__)r1   r@   r2   rH   I   s<   
zMBartTokenizerFast.__init__returnc                 C   s   | j r
tj| j S dS )NF)r   ospathisfiler?   r.   r.   r2   can_save_slow_tokenizer~   s   z*MBartTokenizerFast.can_save_slow_tokenizerc                 C   s   | j S N)rI   r?   r.   r.   r2   r;      s   zMBartTokenizerFast.src_langnew_src_langc                 C   s   || _ | | j  d S rT   )rI   rK   )r@   rU   r.   r.   r2   r;      s   token_ids_0token_ids_1c                 C   s,   |du r| j | | j S | j | | | j S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. The special tokens depend on calling set_lang.

        An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)r%   r&   )r@   rV   rW   r.   r.   r2    build_inputs_with_special_tokens   s   z3MBartTokenizerFast.build_inputs_with_special_tokensc                 C   sP   | j g}| jg}|du rt|| | dg S t|| | | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        Nr   )Zsep_token_idZcls_token_idlen)r@   rV   rW   sepclsr.   r.   r2   $create_token_type_ids_from_sequences   s
   "z7MBartTokenizerFast.create_token_type_ids_from_sequencesreturn_tensorsr;   r<   c                 K   sJ   |du s|du rt d|| _| |fd|d|}| |}||d< |S )zIUsed by translation pipeline, to prepare inputs for the generate functionNzATranslation requires a `src_lang` and a `tgt_lang` for this modelT)Zadd_special_tokensr]   Zforced_bos_token_id)
ValueErrorr;   r>   )r@   Z
raw_inputsr]   r;   r<   extra_kwargsZinputsZtgt_lang_idr.   r.   r2   _build_translation_inputs   s   
z,MBartTokenizerFast._build_translation_inputsr   r   	src_texts	tgt_textsc                    s"   || _ || _t j||fi |S rT   )r;   r<   rG   prepare_seq2seq_batch)r@   ra   r;   rb   r<   rL   rM   r.   r2   rc      s   z(MBartTokenizerFast.prepare_seq2seq_batchc                 C      |  | jS rT   )rK   r;   r?   r.   r.   r2   _switch_to_input_mode      z(MBartTokenizerFast._switch_to_input_modec                 C   rd   rT   )set_tgt_lang_special_tokensr<   r?   r.   r.   r2   _switch_to_target_mode   rf   z)MBartTokenizerFast._switch_to_target_modec                 C   |   |  || _g | _| j| jg| _| | j}| | j}tj|dg | |ddg | tt	|| | j| j d| j
_dS )z_Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code].$A$BsinglepairZspecial_tokensNr>   rJ   r%   Zeos_token_idr&   Zconvert_ids_to_tokensr   ZTemplateProcessinglistzipZ
_tokenizerZpost_processor)r@   r;   prefix_tokens_strsuffix_tokens_strr.   r.   r2   rK         z.MBartTokenizerFast.set_src_lang_special_tokenslangc                 C   ri   )zcReset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code].rj   rk   rl   Nro   )r@   ru   rr   rs   r.   r.   r2   rg      rt   z.MBartTokenizerFast.set_tgt_lang_special_tokenssave_directoryfilename_prefixc                 C   s~   | j stdtj|std| d d S tj||r"|d ndtd  }tj	| j
tj	|kr<t| j
| |fS )NzhYour fast tokenizer does not have the necessary information to save the vocabulary for a slow tokenizer.zVocabulary path (z) should be a directory.- r   )rS   r^   rP   rQ   isdirloggererrorjoinVOCAB_FILES_NAMESabspathr   r   )r@   rv   rw   Zout_vocab_filer.   r.   r2   save_vocabulary   s   z"MBartTokenizerFast.save_vocabulary)NNr'   r(   r(   r'   r)   r*   r+   NNNrT   )r   Nr   )rO   N)"__name__
__module____qualname____doc__r~   Zvocab_files_namesZmodel_input_namesr   Zslow_tokenizer_classr%   r   int__annotations__r&   rH   propertyboolrS   rC   r;   setterr   rX   r\   r`   r	   rc   re   rh   rK   rg   r   r   __classcell__r.   r.   rM   r2   r$   *   s   
 5






(r$   )rP   shutilr   typingr   r   r   Z
tokenizersr   Ztokenization_utilsr   r	   Ztokenization_utils_fastr
   utilsr   r   Ztokenization_mbartr   Z
get_loggerr   r{   r~   rD   r$   __all__r.   r.   r.   r2   <module>   s    

 
h