"""Tokenization class for model MyT5."""

import json
import os
import warnings
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "byte_maps.json"}


class ByteRewriter:
    """
    Byte rewriter class for MyT5 tokenizer.
    This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules.

    Args:
        rewriting_rules (`str` or `Dict[str, str]`):
            A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules.

    z[LEAF]rewriting_rulesc                 C   s   t |tr t|d}t|}W d    n1 sw   Y  nt |ts.tdt| | || _	dd |
 D }| || _d S )NrzDrewriting_rules should be either a path to json file or a dict, got c                 S   s   i | ]\}}||qS  r   ).0kvr   r   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/myt5/tokenization_myt5.py
<dictcomp>8   s    z)ByteRewriter.__init__.<locals>.<dictcomp>)
isinstancestropenjsonloaddict
ValueErrortypeconstruct_hash_tree	hash_treeitemsreverse_hash_tree)selfr   fZreverse_rewriting_rulesr   r   r   __init__.   s   

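
    # A toy illustration of the rule format (hypothetical rules; the real
    # decompose/merge rule sets ship in byte_maps.json): keys and values are
    # space-separated two-digit hex bytes, so {"61 62": "63"} rewrites the
    # UTF-8 byte pair of "ab" into the single byte of "c".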
zByteRewriter.__init__r   byte_in_sequencebyte_out_sequencec                 C   sH   | d}| d}|}|D ]}||vri ||< || }q||| j< dS )zL
        Add a leaf with the output byte sequence to the hash tree.
         N)splitLEAF)r"   r   r%   r&   Zbyte_in_listZbyte_out_listtree_pointerbr   r   r   add_leaf;   s   


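
    # E.g. (hypothetical rule) add_leaf(tree, "61 62", "63") leaves the tree with
    # tree["61"]["62"]["[LEAF]"] == ["63"], alongside the default single-byte
    # leaves tree[b]["[LEAF]"] == [b] installed by construct_hash_tree.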
zByteRewriter.add_leafreturnc                 C   sT   t t}dd tdD D ]
}|g|| | j< q| D ]\}}| ||| q|S )zE
        Construct a hash tree for rewritten byte sequences.
        """
        hash_tree = defaultdict(dict)
        for b in (f"{x:02x}" for x in range(256)):
            hash_tree[b][self.LEAF] = [b]

        for in_sequence, out_sequence in rewriting_rules.items():
            self.add_leaf(hash_tree, in_sequence, out_sequence)

        return hash_tree

    def search_hash_tree(self, byte_sequence: List[str]) -> Union[None, List[str]]:
        """
        Search the hash tree and return the rewritten byte sequence if found.
        N)r   r)   )r"   r3   r*   r+   r   r   r   search_hash_treeW   s   

zByteRewriter.search_hash_treeFin_bytesc           
      C   s   g }d}d}|t |k rS|s| jn| j}t|t |D ](}|| }||v r*|| }n||kr5|g}	|} n n| j|v rC|| j }	|}q||	 |d }|t |k s|S )a6  
        Rewrite a sequence of bytes using the hash tree.

        Args:
            in_bytes (`List[str]`): A list of bytes to be rewritten.
            reverse (`bool`): If True, decoding is performed with the reverse hash tree.
        Returns:
            `List[str]`: The rewritten byte sequence.
        """
        out_bytes = []
        b_start = 0
        b_end = 0

        while b_start < len(in_bytes):
            # Walk the forward tree when encoding, the reverse tree when decoding.
            tree_pointer = self.hash_tree if not reverse else self.reverse_hash_tree
            for j in range(b_start, len(in_bytes)):
                b = in_bytes[j]
                if b in tree_pointer:
                    tree_pointer = tree_pointer[b]
                elif j == b_start:
                    # Defensive fallback: copy the byte through unchanged.
                    cur_leaf = [b]
                    b_end = j
                    break
                else:
                    break
                if self.LEAF in tree_pointer:
                    # Remember the longest rule matched so far.
                    cur_leaf = tree_pointer[self.LEAF]
                    b_end = j

            out_bytes.extend(cur_leaf)
            b_start = b_end + 1

        return out_bytes
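
# A minimal sketch of ByteRewriter on its own, with a hypothetical one-rule map
# (the real decompose/merge rule sets ship in byte_maps.json):
#
#     rewriter = ByteRewriter({"61 62": "63"})
#     rewriter.rewrite_bytes(["61", "62", "64"])          # -> ["63", "64"]
#     rewriter.rewrite_bytes(["63", "64"], reverse=True)  # -> ["61", "62", "64"]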
*." r   c                
       sr  e Zd ZdZddgZeZ					d/	d0 fd
dZedd Z	dd Z
	d1dee deee  ded	ee f fddZdee d	ee fddZ	d2dee deee  d	ee fddZ	d2dee deee  d	ee fddZded	ee fddZd d! Zd"d# Zd$ee d	ee fd%d&Zd$ee d	ee fd'd(Zd)d* Zd2d+ed,ee d	ee fd-d.Z  ZS )3MyT5Tokenizera  
    Construct a MyT5 tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`): The file containing the byte rewriting rules.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the
            vocabulary, like in ByT5 preprocessing; see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add the extra_ids to the list of additional special tokens.
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            # Check that we have the right number of extra_id special tokens.
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to MyT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token

        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits

        # Load the decompose/merge byte rewriting maps and build their rewriters.
        with open(vocab_file, "r") as f:
            self.byte_maps = json.load(f)
        self.decompose_rewriter = ByteRewriter(self.byte_maps["decompose_map"])
        self.merge_rewriter = ByteRewriter(self.byte_maps["merge_map"])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)re   rf   rg   Nr   r6   )r[   get_special_tokens_maskr7   )r"   re   rf   rg   r]   r   r   rh      s   (z%MyT5Tokenizer.get_special_tokens_mask	token_idsc                 C   s>   t |dkr|d | jkrtd| j d |S || jg S )z.Do not add eos again if user already added it.r   zThis sequence already has zQ. In future versions this behavior may lead to duplicated eos tokens being added.)r7   eos_token_idwarningswarnrO   )r"   ri   r   r   r   _add_eos_if_not_present   s   z%MyT5Tokenizer._add_eos_if_not_presentc                 C   s<   | j g}|du rt|| dg S t|| | | dg S )a  
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        N)rn   )r"   re   rf   r   r   r    build_inputs_with_special_tokens  s
   

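
    # E.g. with the default eos id of 1: build_inputs_with_special_tokens([5, 6], [7])
    # returns [5, 6, 1, 7, 1], and get_special_tokens_mask([5, 6], [7]) returns the
    # matching mask [0, 0, 1, 0, 1].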

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words.
        Represents tokens in two character hex format."""
        tokens = [f"{i:02x}" for i in text.encode("utf-8")]
        tokens = self.morphological_encode(tokens)
        return tokens
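
    # E.g. "ab" -> UTF-8 bytes b"ab" -> hex tokens ["61", "62"], which are then
    # rewritten by morphological_encode before ids are assigned.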
zMyT5Tokenizer._tokenizec                 C   s(   t |dkr
d}|S t|d| j }|S )z0Converts a token (str) in an id using the vocab.rN   N   )r7   intrV   )r"   tokenZtoken_idr   r   r   _convert_token_to_id>  s
   z"MyT5Tokenizer._convert_token_to_idc                 C   s   || j  d}|S )z=Converts an index (integer) in a token (str) using the vocab.r.   )rV   )r"   indexry   r   r   r   _convert_id_to_tokenH  s   z"MyT5Tokenizer._convert_id_to_tokenindicesc                 C   $   | j j|dd}| jj|dd}|S )NFr9   )rY   r;   rZ   r"   r}   r   r   r   rt   M     z"MyT5Tokenizer.morphological_encodec                 C   r~   )NTr   )rZ   r;   rY   r   r   r   r   morphological_decodeS  r   z"MyT5Tokenizer.morphological_decodec                 C   s   d}g }|D ] }|| j v r|| j |  q|| jv r!|| q|| q| |}t| j  t| jB }|D ]}||v rH|t|d7 }q:|t|7 }q:|jddd}|S )z:Converts a sequence of tokens (string) in a single string.    rr   ignore)errors)	Zadded_tokens_decoderappendrc   r   rT   valuesbytesfromhexdecode)r"   ru   bstringZ
out_tokensry   Z_added_tokensstringr   r   r   convert_tokens_to_stringY  s    


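
    # Rough round trip through the private helpers (hypothetical byte maps):
    #
    #     tokens = tokenizer._tokenize("ab")                         # e.g. ["61", "62"]
    #     ids = [tokenizer._convert_token_to_id(t) for t in tokens]  # hex value + offset
    #     back = [tokenizer._convert_id_to_token(i) for i in ids]
    #     tokenizer.convert_tokens_to_string(back)                   # -> "ab"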
z&MyT5Tokenizer.convert_tokens_to_stringsave_directoryfilename_prefixc                 C   s   t j|rt j||r|d ndtd  }n
|r|d nd| }t|ddd}|tj| j	ddd	 W d    |fS 1 sBw   Y  |fS )
N- r   wrr   )encodingrN   F)indentensure_ascii)
ospathisdirjoinVOCAB_FILES_NAMESr   writer   dumpsrX   )r"   r   r   r   writerr   r   r   save_vocabularyp  s   
zMyT5Tokenizer.save_vocabulary)rA   rB   rC   rD   N)r-   N)NFr_   )r<   r=   r>   r?   Zmodel_input_namesr   Zvocab_files_namesr$   propertyra   rd   r   rx   r   rJ   rh   rn   ro   rp   r   rv   rz   r|   rt   r   r   r   r   __classcell__r   r   r]   r   r@      sb    	.






(r@   )r?   r   r   rl   collectionsr   typingr   r   r   r   r   Ztokenization_utilsr	   r
   utilsr   Z
get_loggerr<   loggerr   r   r@   __all__r   r   r   r   <module>   s   
f 
v