"""Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...convert_slow_tokenizer import import_protobuf
from ...tokenization_utils import PreTrainedTokenizer
from ...tokenization_utils_base import AddedToken


if TYPE_CHECKING:
    from ...tokenization_utils_base import TextInput

from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class T5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 100):
            Number of extra ids added to the vocabulary for use as sentinels. These tokens are accessible as
            "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. They can be retrieved by calling
            the `get_sentinel_tokens` method, and their ids by calling the `get_sentinel_token_ids` method (an
            illustrative sketch follows `__init__` below).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
        legacy (`bool`, *optional*):
            Whether or not the `legacy` behaviour of the tokenizer should be used. Legacy is the behaviour before the
            merge of #24622 and #25224, which include fixes to properly handle tokens that appear after special
            tokens. A simple
            example:

            - `legacy=True`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=True)
            >>> tokenizer.encode("Hello <extra_id_0>.")
            [8774, 32099, 3, 5, 1]
            ```
            - `legacy=False`:
            ```python
            >>> from transformers import T5Tokenizer

            >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base", legacy=False)
            >>> tokenizer.encode("Hello <extra_id_0>.")  # the extra space `[3]` is no longer here
            [8774, 32099, 5, 1]
            ```
            Check out the [pull request](https://github.com/huggingface/transformers/pull/24565) for more details.
        add_prefix_space (`bool`, *optional*, defaults to `True`):
            Whether or not to add an initial space to the input. This allows the leading word to be treated just like
            any other word.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        legacy=None,
        add_prefix_space=True,
        **kwargs,
    ) -> None:
        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        if additional_special_tokens is not None:
            extra_tokens = [x for x in additional_special_tokens if "<extra_id_" in str(x)]
            if len(extra_tokens) < 1:
                additional_special_tokens += [f"<extra_id_{i}>" for i in range(extra_ids)]
            elif extra_ids > 0 and extra_ids != len(extra_tokens):
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to T5Tokenizer. In this case the additional_special_tokens must include the extra_ids"
                    " tokens"
                )
        else:
            extra_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
            additional_special_tokens = extra_tokens

        # Sentinel tokens occupy the ids directly above the sentencepiece vocabulary,
        # with `<extra_id_0>` receiving the highest id.
        self._added_tokens_decoder = {}
        for i in range(len(extra_tokens)):
            self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
                f"<extra_id_{i}>", single_word=False, lstrip=True, rstrip=True, special=True, normalized=False
            )

        if legacy is None:
            logger.warning_once(
                f"You are using the default legacy behaviour of the {self.__class__}. This is"
                " expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes"
                " for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you"
                " understand what it means, and thoroughly read the reason why this was added as explained in"
                " https://github.com/huggingface/transformers/pull/24565"
            )
            legacy = True

        self.legacy = legacy
        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
        self.vocab_file = vocab_file
        self.add_prefix_space = add_prefix_space

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            legacy=legacy,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
zT5Tokenizer.__init__Fc                 C   s   t jdi | j}| js|r|| j |S t| jd3}| }td| j	j
 d}|j|}| }d|_|j| | }|| W d    |S 1 sRw   Y  |S )NrbzThe new behaviour of z (with `self.legacy = False`)Fr   )r5   r6   r   r2   r8   r   openreadr
   r=   __name__Z
ModelProtoZ
FromStringZNormalizerSpecZadd_dummy_prefixnormalizer_specZ	MergeFromZSerializeToStringZLoadFromSerializedProto)rB   r,   Z	tokenizerfr7   Z	model_pb2modelrI   r   r   r    r>      s"   

		zT5Tokenizer.get_spm_processorc                 C   sZ   | t jv r+t j|  }|d ur||kr|S |d u r+td| d|  d| d| d	t |S )NzGThis tokenizer was incorrectly instantiated with a model max length of z which will be corrected in Transformers v5.

    @staticmethod
    def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
        if pretrained_model_name_or_path in T5Tokenizer.max_model_input_sizes:
            deprecated_max_model_length = T5Tokenizer.max_model_input_sizes[pretrained_model_name_or_path]
            if init_max_model_length is not None and init_max_model_length != max_model_length:
                return init_max_model_length
            elif init_max_model_length is None:
                warnings.warn(
                    "This tokenizer was incorrectly instantiated with a model max length of"
                    f" {deprecated_max_model_length} which will be corrected in Transformers v5.\nFor now, this"
                    " behavior is kept to avoid breaking backwards compatibility when padding/encoding with"
                    " `truncation is True`.\n- Be aware that you SHOULD NOT rely on"
                    f" {pretrained_model_name_or_path} automatically truncating your input to"
                    f" {deprecated_max_model_length} when padding/encoding.\n- If you want to encode/pad to sequences"
                    f" longer than {deprecated_max_model_length} you can either instantiate this tokenizer with"
                    " `model_max_length` or pass `max_length` when encoding/padding.\n- To avoid this warning, please"
                    " instantiate this tokenizer with `model_max_length` set to your preferred value.",
                    FutureWarning,
                )

        return max_model_length

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rV   rW   rX   Nr   r"   )r@   get_special_tokens_maskr9   )rB   rV   rW   rX   rD   r   r    rY      s   (z#T5Tokenizer.get_special_tokens_maskc                 C   s   t ttdd | jS )Nc                 S   s   t td| d uS )Nz<extra_id_\d+>)boolresearch)r   r   r   r    <lambda>  s    z1T5Tokenizer.get_sentinel_tokens.<locals>.<lambda>)listsetfilterr1   rQ   r   r   r    get_sentinel_tokens  s   zT5Tokenizer.get_sentinel_tokensc                    s    fdd   D S )Nc                    s   g | ]}  |qS r   )Zconvert_tokens_to_ids)r   tokenrQ   r   r    r!     s    z6T5Tokenizer.get_sentinel_token_ids.<locals>.<listcomp>)ra   rQ   r   rQ   r    get_sentinel_token_ids  s   z"T5Tokenizer.get_sentinel_token_ids	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r9   eos_token_idrL   rM   r-   )rB   rd   r   r   r    _add_eos_if_not_present  s   z#T5Tokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make
        use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rg   )rB   rV   rW   r   r   r     build_inputs_with_special_tokens?  s
   

z,T5Tokenizer.build_inputs_with_special_tokensc                 C   s   | j  }d |d< |S )Nr7   )__dict__copy)rB   stater   r   r    __getstate__Y  s   
zT5Tokenizer.__getstate__c                 C   s<   || _ t| dsi | _tjdi | j| _| j| j d S )Nr   r   )rj   hasattrr   r5   r6   r7   r8   r   )rB   dr   r   r    __setstate__^  s
   
zT5Tokenizer.__setstate__textr   c                    s   | j s	t|dkrt j|fi |S |td}| jr t| }t j|fi |}t|dkrC|d tkrC|d | jv rC|dd }|S )z
        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
        first token is special.
        """
        if self.legacy or len(text) == 0:
            return super().tokenize(text, **kwargs)

        text = text.replace(SPIECE_UNDERLINE, " ")
        if self.add_prefix_space:
            text = SPIECE_UNDERLINE + text

        tokens = super().tokenize(text, **kwargs)

        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
            tokens = tokens[1:]
        return tokens

    @property
    def unk_token_length(self):
        return len(self.sp_model.encode(str(self.unk_token)))

    def _tokenize(self, text, **kwargs):
        """
        Returns a tokenized string.

        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
        """
        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
            return self.sp_model.encode(text, out_type=str)

        # 1. Encode string + prefix, e.g. "<unk> Hey"
        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
        # 2. Remove the unk_token pieces from the front of the result
        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
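
    # In non-legacy mode the text reaching `_tokenize` starts with `▁` or a space, and encoding
    # it directly would drop that prefix (see the docstring above). Prepending `unk_token` keeps
    # sentencepiece from stripping it; the first `unk_token_length` pieces are then removed.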
 zT5Tokenizer._tokenizec                 C   s   | j |S )z0Converts a token (str) in an id using the vocab.)r7   Zpiece_to_id)rB   rb   r   r   r    _convert_token_to_id  s   z T5Tokenizer._convert_token_to_idc                 C   s   | j |}|S )z=Converts an index (integer) in a token (str) using the vocab.)r7   Z	IdToPiece)rB   indexrb   r   r   r    _convert_id_to_token  s   z T5Tokenizer._convert_id_to_tokenc                 C   s   |d  tr| jr|d dd |d< g }d}d}|D ]#}|| jv r8|s)|d7 }|| j|| 7 }d}g }q|| d}q|| j|7 }| S )z:Converts a sequence of tokens (string) in a single string.r   r"   N Frr   T)r{   ru   r3   rv   r7   decodeappendstrip)rB   rx   Zcurrent_sub_tokensZ
out_stringZprev_is_specialrb   r   r   r    convert_tokens_to_string  s    

z$T5Tokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|std| d d S t j||r|d ndtd  }t j| jt j|kr?t j	| jr?t
| j| |fS t j	| jsgt|d}| j }|| W d    |fS 1 sbw   Y  |fS )NzVocabulary path (z) should be a directory-r   r   wb)ospathisdirr<   errorjoinVOCAB_FILES_NAMESabspathr   isfiler   rF   r7   Zserialized_model_protowrite)rB   r   r   Zout_vocab_filefiZcontent_spiece_modelr   r   r    save_vocabulary  s"   (

zT5Tokenizer.save_vocabulary)r   r   r   r   NNNT)F)NFrP   )&rH   
__module____qualname____doc__r   Zvocab_files_namesZmodel_input_namesr   r   r   r   rA   r>   staticmethodrO   propertyrR   rU   r   intrZ   rY   ra   rc   rg   rh   ri   rm   rp   rs   rz   r|   r}   r   r   r   r   __classcell__r   r   rD   r    r   ,   s|    N
K








(r   )!r   r   r[   rL   shutilr   typingr   r   r   r   r   r   r   r5   Zconvert_slow_tokenizerr
   Ztokenization_utilsr   Ztokenization_utils_baser   r   utilsr   Zutils.import_utilsr   Z
get_loggerrH   r<   r   ru   r   __all__r   r   r   r    <module>   s.    
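
# Hedged end-to-end usage sketch (requires the sentencepiece backend and access to a pretrained
# checkpoint such as "google-t5/t5-base"; exact ids depend on the checkpoint's vocabulary):
#
#     >>> from transformers import T5Tokenizer
#     >>> tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-base")
#     >>> ids = tokenizer("Hello world").input_ids  # build_inputs_with_special_tokens appends </s>
#     >>> ids[-1] == tokenizer.eos_token_id
#     True
#     >>> tokenizer.get_special_tokens_mask(ids[:-1])  # 0 = sequence token, 1 = the appended eos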
   
