o
    ZhK'                     @   s`   d Z ddlZddlmZmZmZ ddlmZmZ ddl	m
Z
 e
eZG dd deZdgZdS )	z"Tokenization class for model ByT5.    N)ListOptionalTuple   )
AddedTokenPreTrainedTokenizer)loggingc                
       s:  e Zd ZdZddgZ					d*	d+ fd
dZedd Zdd Z	d,de	e
 dee	e
  ded	e	e
 f fddZde	e
 d	e	e
 fddZ	d-de	e
 dee	e
  d	e	e
 fddZ	d-de	e
 dee	e
  d	e	e
 fddZded	e	e fddZd d! Zd"d# Zd$d% Zd-d&ed'ee d	ee fd(d)Z  ZS ).ByT5Tokenizera  
    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
            like in ByT5 preprocessing see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    Z	input_idsZattention_mask</s><unk><pad>}   Nreturnc                    s
  |dkr|d u rdd t |D }n(|dkr:|d ur:t|dkr:tttdd |}||kr:td| d| dt|trFt|d	d	d
n|}t|trTt|d	d	d
n|}t|trbt|d	d	d
n|}|||d| _t| j| _	d| _
t jd|||d|d| d S )Nr   c                 S   s   g | ]}d | dqS )z
<extra_id_> .0ir   r   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/byt5/tokenization_byt5.py
<listcomp>I       z*ByT5Tokenizer.__init__.<locals>.<listcomp>c                 S   s   t dt| v S )NZextra_id)boolstr)xr   r   r   <lambda>L   s    z(ByT5Tokenizer.__init__.<locals>.<lambda>zBoth extra_ids (z!) and additional_special_tokens (zm) are provided to ByT5Tokenizer. In this case the additional_special_tokens must include the extra_ids tokensT)lstriprstrip)r            )	eos_token	unk_token	pad_token	extra_idsadditional_special_tokensr   )rangelensetfilter
ValueError
isinstancer   r   Z_added_tokens_decoderoffset_utf_vocab_sizesuper__init__)selfr    r!   r"   r#   r$   kwargsZextra_tokens	__class__r   r   r.   >   s.   

zByT5Tokenizer.__init__c                 C   s   | j S N)r,   r/   r   r   r   
vocab_sizee   s   zByT5Tokenizer.vocab_sizec                    s.    fddt  j j D }| j |S )Nc                    s   i | ]}  ||qS r   )Zconvert_ids_to_tokensr   r4   r   r   
<dictcomp>j   r   z+ByT5Tokenizer.get_vocab.<locals>.<dictcomp>)r%   r5   r+   updateadded_tokens_encoder)r/   Zvocabr   r4   r   	get_vocabi   s   zByT5Tokenizer.get_vocabFtoken_ids_0token_ids_1already_has_special_tokensc                    sZ   |rt  j||ddS |du rdgt| dg S dgt| dg dgt|  dg S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r:   r;   r<   Nr   r   )r-   get_special_tokens_maskr&   )r/   r:   r;   r<   r1   r   r   r=   n   s   (z%ByT5Tokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r&   eos_token_idwarningswarnr    )r/   r>   r   r   r   _add_eos_if_not_present   s   z%ByT5Tokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        Nr   )r@   r&   )r/   r:   r;   Zeosr   r   r   $create_token_type_ids_from_sequences   s   z2ByT5Tokenizer.create_token_type_ids_from_sequencesc                 C   s(   |  |}|du r|S |  |}|| S )a  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rC   )r/   r:   r;   r   r   r    build_inputs_with_special_tokens   s
   

z.ByT5Tokenizer.build_inputs_with_special_tokenstextc                 C   s   dd | dD }|S )zPTake as input a string and return a list of strings (tokens) for words/sub-wordsc                 S   s   g | ]}t |qS r   )chrr   r   r   r   r      s    z+ByT5Tokenizer._tokenize.<locals>.<listcomp>utf-8)encode)r/   rF   tokensr   r   r   	_tokenize   s   zByT5Tokenizer._tokenizec                 C   s&   t |dkr
d}|S t|| j }|S )z0Converts a token (str) in an id using the vocab.r   N)r&   ordr+   )r/   tokenZtoken_idr   r   r   _convert_token_to_id   s
   z"ByT5Tokenizer._convert_token_to_idc                 C   s   t || j }|S )z=Converts an index (integer) in a token (str) using the vocab.)rG   r+   )r/   indexrM   r   r   r   _convert_id_to_token   s   z"ByT5Tokenizer._convert_id_to_tokenc                 C   sh   d}|D ]&}|| j v r| j | d}n|| jv r|d}ntt|g}||7 }q|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    rH   ignore)errors)Zadded_tokens_decoderrI   r8   bytesrL   decode)r/   rJ   bstringrM   Z
tok_stringstringr   r   r   convert_tokens_to_string   s   


z&ByT5Tokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   dS )Nr   r   )r/   rY   rZ   r   r   r   save_vocabulary   s   zByT5Tokenizer.save_vocabulary)r
   r   r   r   N)r   N)NFr3   )__name__
__module____qualname____doc__Zmodel_input_namesr.   propertyr5   r9   r   intr   r   r=   rC   rD   rE   r   rK   rN   rP   rX   r   r[   __classcell__r   r   r1   r   r	      s\     '






(r	   )r_   rA   typingr   r   r   Ztokenization_utilsr   r   utilsr   Z
get_loggerr\   loggerr	   __all__r   r   r   r   <module>   s   
 
R