"""Tokenization class for model MyT5."""

import json
import os
import warnings
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "byte_maps.json"}


class ByteRewriter:
    """
    Byte rewriter class for MyT5 tokenizer.
    This class is used to rewrite bytes using a hash tree. The hash tree is constructed from a set of rewriting rules.

    Args:
        rewriting_rules (`str` or `Dict[str, str]`):
            A path to a json file containing the rewriting rules or a dictionary containing the rewriting rules.
    """
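
    # Illustration only (this rule is hypothetical; the real maps ship in
    # byte_maps.json): a rule maps one space-separated sequence of two-character
    # hex bytes to another, e.g.
    #
    #     rules = {"68 65 6c 6c 6f": "68 80"}
    #     ByteRewriter(rules).rewrite_bytes(["68", "65", "6c", "6c", "6f"])
    #     # -> ["68", "80"]
    #
    # Unmatched bytes pass through unchanged, since every single byte gets an
    # identity leaf when the hash tree is constructed.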
z[LEAF]rewriting_rulesc                    [        U[        5      (       a,  [        US5       n[        R                  " U5      nS S S 5        O,[        U[
        5      (       d  [        S[        U5       35      eU R                  U5      U l	        UR                  5        VVs0 s H  u  p4XC_M	     nnnU R                  U5      U l        g ! , (       d  f       N_= fs  snnf )NrzDrewriting_rules should be either a path to json file or a dict, got )
isinstancestropenjsonloaddict
ValueErrortypeconstruct_hash_tree	hash_treeitemsreverse_hash_tree)selfr   fkvreverse_rewriting_ruless         b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/myt5/tokenization_myt5.py__init__ByteRewriter.__init__.   s    os++os+q"&))A, ,+OT22VW[\kWlVmn  11/B4C4I4I4K"L4KDA144K"L!%!9!9:Q!R ,+ #Ms   B?C?
Cr   byte_in_sequencebyte_out_sequencec                     UR                  S5      nUR                  S5      nUnU H  nXv;  a  0 Xg'   Xg   nM     XVU R                  '   g)z<
Add a leaf with the output byte sequence to the hash tree.
 N)splitLEAF)r    r   r(   r)   byte_in_listbyte_out_listtree_pointerbs           r%   add_leafByteRewriter.add_leaf;   sU     (--c2)//4 A$"$'?L 
 #0TYY    returnc                     [        [        5      nS [        S5       5        H  nU/X#   U R                  '   M     UR	                  5        H  u  pEU R                  X$U5        M     U$ )z5

    def construct_hash_tree(self, rewriting_rules: Dict[str, str]) -> Dict[str, Union[dict, List[str]]]:
        """
        Construct a hash tree for rewritten byte sequences.
        """
        hash_tree = defaultdict(dict)
        # Seed an identity leaf for every possible byte so unmatched bytes survive rewriting.
        for b in (f"{x:02x}" for x in range(256)):
            hash_tree[b][self.LEAF] = [b]

        for in_sequence, out_sequence in rewriting_rules.items():
            self.add_leaf(hash_tree, in_sequence, out_sequence)

        return hash_tree

    def search_hash_tree(self, byte_sequence: List[str]) -> Union[List[str], None]:
        """
        Search the hash tree and return the rewritten byte sequence if found.
        """
        tree_pointer = self.hash_tree
        for b in byte_sequence:
            if b in tree_pointer:
                tree_pointer = tree_pointer[b]
            else:
                return None

        return tree_pointer[self.LEAF]

    def rewrite_bytes(self, in_bytes: List[str], reverse=False) -> List[str]:
        """
        Rewrite a sequence of bytes using the hash tree.

        Args:
            in_bytes (`List[str]`): A list of bytes to be rewritten.
            reverse (`bool`): If True, decoding is performed with the reverse hash tree.
        Returns:
            `List[str]`: The rewritten byte sequence.
        """
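        # Worked example with a hypothetical rule "61 62" -> "90": the input
        # ["61", "62", "63"] first matches the two-byte rule (the deepest leaf
        # reached wins), emits "90", then restarts at "63", which falls back to
        # its single-byte identity leaf -- yielding ["90", "63"].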
        out_bytes = []
        b_start = 0
        b_end = 0

        while b_start < len(in_bytes):
            tree_pointer = self.hash_tree if not reverse else self.reverse_hash_tree
            for j in range(b_start, len(in_bytes)):
                b = in_bytes[j]
                if b in tree_pointer:
                    tree_pointer = tree_pointer[b]
                elif j == b_start:
                    cur_leaf = [b]
                    b_end = j
                    break
                else:
                    break
                if self.LEAF in tree_pointer:
                    cur_leaf = tree_pointer[self.LEAF]
                    b_end = j
            out_bytes.extend(cur_leaf)
            b_start = b_end + 1

        return out_bytes


class MyT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a MyT5 tokenizer.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`): The file containing the byte rewriting rules.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (`int`, *optional*, defaults to 125):
            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary,
            like in ByT5 preprocessing; see
            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
        additional_special_tokens (`List[str]`, *optional*):
            Additional special tokens used by the tokenizer.
    """
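
    # Usage sketch ("Tomlim/myt5-base" is assumed here as an example checkpoint;
    # the exact ids depend on the byte maps it ships with):
    #
    #     tokenizer = MyT5Tokenizer.from_pretrained("Tomlim/myt5-base")
    #     ids = tokenizer("hello")["input_ids"]  # morphologically rewritten byte ids + </s>
    #     tokenizer.decode(ids, skip_special_tokens=True)  # -> "hello"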

    model_input_names = ["input_ids", "attention_mask"]
    vocab_files_names = VOCAB_FILES_NAMES

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=125,
        additional_special_tokens=None,
        **kwargs,
    ) -> None:
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
                    " provided to MyT5Tokenizer. In this case the additional_special_tokens must include the"
                    " extra_ids tokens"
                )

        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token

        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
        self.offset = len(self._added_tokens_decoder)
        self._utf_vocab_size = 2**8  # utf is 8 bits

        with open(vocab_file, "r") as f:
            self.byte_maps = json.load(f)

        self.decompose_rewriter = ByteRewriter(self.byte_maps["decompose_map"])
        self.merge_rewriter = ByteRewriter(self.byte_maps["merge_map"])

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=0,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
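        # For example, a single five-token sequence yields [0, 0, 0, 0, 0, 1]:
        # five sequence tokens plus the </s> appended by build_inputs_with_special_tokens.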
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. MyT5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
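        # E.g. a pair of sequences of lengths 3 and 2 produces seven zeros,
        # [0, 0, 0, 0, 0, 0, 0]: each part counts one appended </s>.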
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A sequence has the following format:

        - single sequence: `X </s>`
        - pair of sequences: `A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
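        # The eos id is appended to each part (unless already present), so a
        # pair becomes `A </s> B </s>` and a lone sequence becomes `X </s>`.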
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        token_ids_1 = self._add_eos_if_not_present(token_ids_1)
        return token_ids_0 + token_ids_1

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words.
        Represents tokens in two character hex format"""
        tokens = [f"{i:02x}" for i in text.encode("utf-8")]
        tokens = self.morphological_encode(tokens)
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 2:
            token_id = None
        else:
            token_id = int(token, 16) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = f"{index - self.offset:02x}"
        return token

    def morphological_encode(self, indices: List[str]) -> List[str]:
        # Decompose, then merge, the byte sequence with the two rewriters.
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=False)
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=False)
        return indices

    def morphological_decode(self, indices: List[str]) -> List[str]:
        # Invert the encoding: un-merge first, then re-compose.
        indices = self.merge_rewriter.rewrite_bytes(indices, reverse=True)
        indices = self.decompose_rewriter.rewrite_bytes(indices, reverse=True)
        return indices

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""

        out_tokens = []
        for token in tokens:
            if token in self.added_tokens_decoder:
                out_tokens.append(self.added_tokens_decoder[token])
            elif token in self.added_tokens_encoder:
                out_tokens.append(token)
            else:
                out_tokens.append(token)

        out_tokens = self.morphological_decode(out_tokens)
        _added_tokens = set(self.added_tokens_decoder.values()) | set(self.added_tokens_encoder)
        for token in out_tokens:
            if token in _added_tokens:
                bstring += bytes(token, "utf-8")
            else:
                bstring += bytes.fromhex(token)
        string = bstring.decode("utf-8", errors="ignore")
        return string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory

        with open(vocab_file, "w", encoding="utf-8") as writer:
            writer.write(json.dumps(self.byte_maps, indent=2, ensure_ascii=False))

        return (vocab_file,)


__all__ = ["MyT5Tokenizer"]