"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}


@requires(backends=("sentencepiece",))
class XGLMTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
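
            A minimal sketch of passing these options (the values shown are illustrative, not recommended
            defaults):

            ```python
            tokenizer = XGLMTokenizer(
                "sentencepiece.bpe.model",
                sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
            )
            ```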

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
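
    Example (a minimal usage sketch; `facebook/xglm-564M` is a public XGLM checkpoint):

    ```python
    >>> from transformers import XGLMTokenizer

    >>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
    >>> input_ids = tokenizer("Hello world")["input_ids"]
    >>> tokenizer.decode(input_ids)
    ```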
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> None:
      s  |d u ri n|_ d_dd tjD }
 dg pg  d<  d   fdd|
D 7  < tjdi j _jt| |_	d_
dddd	d
_tjfddtjD }
j|
 dd j D _t jd||||||j d  d S )N   c                 S   s   g | ]}d | dqS z<madeupword> .0ir   r   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/xglm/tokenization_xglm.py
<listcomp>w       z*XGLMTokenizer.__init__.<locals>.<listcomp>additional_special_tokensc                    s   g | ]
}| d  vr|qS )r    r   )r   word)kwargsr   r   r   z   s       r      r   )r   r   r   r   c                    s$   i | ]}d | d|  j  qS r   )fairseq_offsetr   )selfsp_sizer   r   
<dictcomp>   s   $ z*XGLMTokenizer.__init__.<locals>.<dictcomp>c                 S   s   i | ]\}}||qS r   r   )r   kvr   r   r   r(      s    )	bos_token	eos_token	unk_token	sep_token	cls_token	pad_tokenr   r   )r   num_madeup_wordsrangegetspmSentencePieceProcessorsp_modelLoadstrr   r%   fairseq_tokens_to_idslenupdateitemsfairseq_ids_to_tokenssuper__init__)r&   r   r+   r,   r.   r/   r-   r0   r   r"   Zmadeup_words	__class__)r"   r&   r'   r   r?   g   s6   
	

zXGLMTokenizer.__init__c                 C   s$   | j  }d |d< | j |d< |S )Nr6   sp_model_proto)__dict__copyr6   serialized_model_proto)r&   stater   r   r   __getstate__   s   
zXGLMTokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r   )rC   hasattrr   r4   r5   r6   ZLoadFromSerializedProtorB   )r&   dr   r   r   __setstate__   s
   
zXGLMTokenizer.__setstate__token_ids_0token_ids_1c                 C   s0   |du r
| j g| S | j g}|| | | | S )a  
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An XGLM sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s></s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
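
        Example (a sketch, assuming `tokenizer` is an instantiated `XGLMTokenizer` and `10`-`13` are ordinary
        vocabulary ids):

        ```python
        >>> sep = tokenizer.sep_token_id
        >>> tokenizer.build_inputs_with_special_tokens([10, 11]) == [sep, 10, 11]
        True
        >>> tokenizer.build_inputs_with_special_tokens([10, 11], [12, 13]) == [sep, 10, 11, sep, sep, 12, 13]
        True
        ```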
        N)sep_token_idr&   rK   rL   sepr   r   r    build_inputs_with_special_tokens   s   z.XGLMTokenizer.build_inputs_with_special_tokensFalready_has_special_tokensc                    s\   |rt  j||ddS |du rdgdgt|  S dgdgt|  ddg dgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
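
        Example (a sketch under the same assumption that `tokenizer` is an instantiated `XGLMTokenizer`):

        ```python
        >>> tokenizer.get_special_tokens_mask([10, 11])
        [1, 0, 0]
        >>> tokenizer.get_special_tokens_mask([10, 11], [12, 13])
        [1, 0, 0, 1, 1, 0, 0]
        ```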
        T)rK   rL   rQ   Nr#   r   )r>   get_special_tokens_maskr:   )r&   rK   rL   rQ   r@   r   r   rR      s   *z%XGLMTokenizer.get_special_tokens_maskc                 C   s@   | j g}|du rt|| dg S t|| | | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
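
        Example (a sketch, with `tokenizer` an instantiated `XGLMTokenizer`):

        ```python
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11])
        [0, 0, 0]
        >>> tokenizer.create_token_type_ids_from_sequences([10, 11], [12, 13])
        [0, 0, 0, 0, 0, 0, 0]
        ```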

        Nr   )rM   r:   rN   r   r   r   $create_token_type_ids_from_sequences   s   z2XGLMTokenizer.create_token_type_ids_from_sequencesc                 C   s   t | j| j | j S N)r:   r6   r%   r1   r&   r   r   r   
vocab_size   s   zXGLMTokenizer.vocab_sizec                    s(    fddt  jD }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokensr   rU   r   r   r(     r   z+XGLMTokenizer.get_vocab.<locals>.<dictcomp>)r2   rV   r;   Zadded_tokens_encoder)r&   Zvocabr   rU   r   	get_vocab   s   zXGLMTokenizer.get_vocabtextc                 C   s   | j j|tdS )N)Zout_type)r6   encoder8   )r&   rX   r   r   r   	_tokenize  s   zXGLMTokenizer._tokenizec                 C   s4   || j v r
| j | S | j|}|r|| j S | jS )z0Converts a token (str) in an id using the vocab.)r9   r6   Z	PieceToIdr%   Zunk_token_id)r&   tokenZspm_idr   r   r   _convert_token_to_id  s   

z"XGLMTokenizer._convert_token_to_idc                 C   s&   || j v r
| j | S | j|| j S )z=Converts an index (integer) in a token (str) using the vocab.)r=   r6   Z	IdToPiecer%   )r&   indexr   r   r   _convert_id_to_token  s   

z"XGLMTokenizer._convert_id_to_tokenc                 C   s   d |td }|S )zIConverts a sequence of tokens (strings for sub-words) in a single string.  )joinreplaceSPIECE_UNDERLINEstrip)r&   tokensZ
out_stringr   r   r   convert_tokens_to_string  s   z&XGLMTokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-r_   r   wb)ospathisdirloggererrorra   VOCAB_FILES_NAMESabspathr   isfiler   openr6   rE   write)r&   rg   rh   Zout_vocab_filefiZcontent_spiece_modelr   r   r   save_vocabulary  s"   (

zXGLMTokenizer.save_vocabulary)r   r   r   r   r   r   NrT   )NF)__name__
__module____qualname____doc__rp   Zvocab_files_namesZmodel_input_namesr   r   r8   r   r?   rG   rJ   r   intrP   boolrR   rS   propertyrV   rW   rZ   r\   r^   rf   r   rv   __classcell__r   r   r@   r   r   #   sh    ?	8





	(r   )rz   rk   shutilr   typingr   r   r   r   r   r   r4   Ztokenization_utilsr	   utilsr
   Zutils.import_utilsr   Z
get_loggerrw   rn   rc   rp   r   __all__r   r   r   r   <module>   s    
  
