"""Tokenization classes for PhoBERT"""

import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "merges_file": "bpe.codes"}


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char

    pairs = set(pairs)
    return pairs
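

# Quick illustration of get_pairs (a comment-only sketch with a made-up input,
# not part of the original module):
# get_pairs(("l", "o", "w", "</w>")) -> {("l", "o"), ("o", "w"), ("w", "</w>")}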
       H  ^  \ rS rSrSr\r       SU 4S jjr SS\\	   S\
\\	      S\\	   4S jjr SS\\	   S\
\\	      S\S\\	   4U 4S	 jjjr SS\\	   S\
\\	      S\\	   4S
 jjr\S 5       rS rS rS rS rS rS rSS\S\
\   S\\   4S jjrS rSrU =r$ )PhobertTokenizer3   a  
Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    bos_token (`st`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values. This is the token used when training this model with masked language
        modeling. This is the token which the model will try to predict.
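
    # File-format note (inferred from the loading code below, not original
    # documentation): `vocab.txt` is expected to hold one "<token> <count>"
    # entry per line, and `bpe.codes` one "<left> <right> <count>" merge rule
    # per line.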

    def __init__(
        self,
        vocab_file,
        merges_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        **kwargs,
    ):
        self.vocab_file = vocab_file
        self.merges_file = merges_file

        self.encoder = {}
        self.encoder[str(bos_token)] = 0
        self.encoder[str(pad_token)] = 1
        self.encoder[str(eos_token)] = 2
        self.encoder[str(unk_token)] = 3

        self.add_from_file(vocab_file)

        self.decoder = {v: k for k, v in self.encoder.items()}

        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:-1]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            **kwargs,
        )
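
    # Resulting id layout (follows from the assignments above): the four
    # special tokens take ids 0-3 in the order <s>, <pad>, </s>, <unk>, and
    # add_from_file() then appends the vocab.txt entries in file order.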

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A PhoBERT sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. PhoBERT does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
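
    # Worked example (ids follow from the fixed specials assigned in __init__):
    # with cls = <s> = 0 and sep = </s> = 2,
    #   build_inputs_with_special_tokens([5], [6])      -> [0, 5, 2, 2, 6, 2]
    #   get_special_tokens_mask([5], [6])               -> [1, 0, 1, 1, 0, 1]
    #   create_token_type_ids_from_sequences([5], [6])  -> [0, 0, 0, 0, 0, 0]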

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = "@@ ".join(word)
        word = word[:-4]
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []

        words = re.findall(r"\S+\n?", text)

        for token in words:
            split_tokens.extend(list(self.bpe(token).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = " ".join(tokens).replace("@@ ", "").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        out_merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Note: this branch mirrors the sentencepiece-style template the file
            # was derived from; PhobertTokenizer itself never defines `sp_model`,
            # so it is only reachable when a subclass provides one.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
            copyfile(self.merges_file, out_merge_file)

        return out_vocab_file, out_merge_file

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        """
        if isinstance(f, str):
            try:
                with open(f, "r", encoding="utf-8") as fd:
                    self.add_from_file(fd)
            except FileNotFoundError as fnfe:
                raise fnfe
            except UnicodeError:
                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
            return

        lines = f.readlines()
        for lineTmp in lines:
            line = lineTmp.strip()
            idx = line.rfind(" ")
            if idx == -1:
                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
            word = line[:idx]
            self.encoder[word] = len(self.encoder)


__all__ = ["PhobertTokenizer"]
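

# Minimal usage sketch (not part of the original module). It assumes the
# published "vinai/phobert-base" checkpoint is reachable and that the input is
# already word-segmented Vietnamese, which PhoBERT expects:
if __name__ == "__main__":
    tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base")
    ids = tokenizer("Tôi là sinh_viên")["input_ids"]
    print(tokenizer.convert_ids_to_tokens(ids))  # e.g. ['<s>', ..., '</s>']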