import json
import os
import re
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple, Union

import sentencepiece

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that `nbest_size` is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    language_code_re = re.compile(">>.+<<")  # type: re.Pattern

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

        self._setup_normalizer()

        super().__init__(
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        match = self.language_code_re.match(text)
        code: list = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `List[str]`: The list of decoded sentences.
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            # copy the original spm file if it exists somewhere else, otherwise
            # serialize the in-memory model
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> Dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> Dict:
        state = self.__dict__.copy()
        # SentencePiece processors and the moses normalizer are not picklable;
        # drop them here and rebuild them in __setstate__
        state.update(
            dict.fromkeys(["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"])
        )
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[Dict, List]:
    with open(path, "r") as f:
        return json.load(f)


__all__ = ["MarianTokenizer"]
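

# Illustrative usage sketch, not part of the library API: it shows `sp_model_kwargs`
# flowing through `load_spm` into `SentencePieceProcessor` to enable subword
# regularization, and `use_source_tokenizer` selecting the source spm on decode.
# Assumes the public "Helsinki-NLP/opus-mt-en-de" checkpoint is reachable; run with
# `python -m transformers.models.marian.tokenization_marian`.
if __name__ == "__main__":
    from transformers import MarianTokenizer as _MarianTokenizer

    _tok = _MarianTokenizer.from_pretrained(
        "Helsinki-NLP/opus-mt-en-de",
        sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    )
    # With sampling enabled, segmentation is stochastic: repeated calls may split
    # the same word into different pieces.
    print(_tok.tokenize("I am a small frog."))
    ids = _tok("I am a small frog.")["input_ids"]
    # Decoding with the source spm reconstructs the (normalized) source sentence.
    print(_tok.decode(ids, skip_special_tokens=True, use_source_tokenizer=True))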