
# coding=utf-8
# Copyright 2020 The Google AI Language Team Authors, Allegro.pl, Facebook Inc. and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import os
import re
import unicodedata
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}

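# Note on the two files named above, as they are read by HerbertTokenizer.__init__ further
# down: "vocab.json" is a JSON mapping from sub-word token to integer id, and "merges.txt"
# lists one BPE merge rule per line (two space-separated symbols), with earlier lines taking
# priority. Any concrete tokens or merge rules used in example comments in this file are
# purely illustrative and do not come from a real HerBERT vocabulary.
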
def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs

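# Worked example for get_pairs (the word tuple below is made up, not taken from a real vocab):
#   get_pairs(("h", "e", "r", "b", "e", "r", "t</w>"))
#   == {("h", "e"), ("e", "r"), ("r", "b"), ("b", "e"), ("r", "t</w>")}
# The duplicate ("e", "r") pair collapses because the result is a set.
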
def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text

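# Illustrative call (made-up input string): replace_unicode_punct("２０２０年！") returns
# "2020年!" -- full-width digits and punctuation are mapped to their ASCII counterparts,
# while characters without a rule pass through unchanged.
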
def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

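# Quick illustration of the pre-tokenizer behaviour (hypothetical input, not executed on import):
#   BasicTokenizer(do_lower_case=False).tokenize("Hello, world!")
#   == ["Hello", ",", "world", "!"]
# i.e. splitting happens on whitespace and on every punctuation character separately.

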
class HerbertTokenizer(PreTrainedTokenizer):
    """
    Construct a BPE tokenizer for HerBERT.

    Peculiarities:

    - uses BERT's pre-tokenizer: BaseTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of a
      punctuation character will be treated separately.

    - Such pretokenized input is BPE subtokenized

    This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should refer to the
    superclass for more information regarding methods.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    def __init__(
        self,
        vocab_file,
        merges_file,
        tokenizer_file=None,
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sep_token="</s>",
        bos_token="<s>",
        do_lowercase_and_remove_accent=False,
        additional_special_tokens=[
            "<special0>",
            "<special1>",
            "<special2>",
            "<special3>",
            "<special4>",
            "<special5>",
            "<special6>",
            "<special7>",
            "<special8>",
            "<special9>",
        ],
        lang2id=None,
        id2lang=None,
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use HerbertTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        # cache of sm.MosesPunctNormalizer instance
        self.cache_moses_punct_normalizer = {}
        # cache of sm.MosesTokenizer instance
        self.cache_moses_tokenizer = {}
        self.lang_with_custom_tokenizer = {"zh", "th", "ja"}
        self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent
        self.lang2id = lang2id
        self.id2lang = id2lang
        if lang2id is not None and id2lang is not None:
            assert len(lang2id) == len(id2lang)

        self.ja_word_tokenizer = None
        self.zh_word_tokenizer = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            lang2id=lang2id,
            id2lang=id2lang,
            do_lowercase_and_remove_accent=do_lowercase_and_remove_accent,
            tokenizer_file=None,
            **kwargs,
        )

        self.bert_pre_tokenizer = BasicTokenizer(
            do_lower_case=False,
            never_split=self.all_special_tokens,
            tokenize_chinese_chars=False,
            strip_accents=False,
        )

    @property
    def do_lower_case(self):
        return self.do_lowercase_and_remove_accent

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        else:
            punct_normalizer = self.cache_moses_punct_normalizer[lang]
        return punct_normalizer.normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        else:
            moses_tokenizer = self.cache_moses_tokenizer[lang]
        return moses_tokenizer.tokenize(text, return_str=False, escape=False)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    def ja_tokenize(self, text):
        if self.ja_word_tokenizer is None:
            try:
                import Mykytea

                self.ja_word_tokenizer = Mykytea.Mykytea(
                    f"-model {os.path.expanduser('~')}/local/share/kytea/model.bin"
                )
            except (AttributeError, ImportError):
                logger.error(
                    "Make sure you install KyTea (https://github.com/neubig/kytea) and it's python wrapper"
                    " (https://github.com/chezou/Mykytea-python) with the following steps"
                )
                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
                logger.error("2. autoreconf -i")
                logger.error("3. ./configure --prefix=$HOME/local")
                logger.error("4. make && make install")
                logger.error("5. pip install kytea")
                raise
        return list(self.ja_word_tokenizer.getWS(text))

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        pre_tokens = self.bert_pre_tokenizer.tokenize(text)

        split_tokens = []
        for token in pre_tokens:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

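    # Worked example of the sub-tokenization chain above, under a hypothetical merges table --
    # real outputs depend entirely on the vocab.json / merges.txt the tokenizer was loaded with.
    # If bpe_ranks were {("k", "o"): 0, ("ko", "t</w>"): 1}:
    #   self.bpe("kot")                              -> "kot</w>"
    #   self._tokenize("kot")                        -> ["kot</w>"]
    #   self.convert_tokens_to_string(["kot</w>"])   -> "kot"
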
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An XLM sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.

        """
        bos = [self.bos_token_id]
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return bos + token_ids_0 + sep
        return bos + token_ids_0 + sep + token_ids_1 + sep

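    # Concrete shapes for the special-token helpers (the ids below are placeholders; the real
    # <s>/</s> ids come from the loaded vocabulary). Assuming bos_token_id=0 and sep_token_id=2:
    #   build_inputs_with_special_tokens([10, 11])        -> [0, 10, 11, 2]
    #   build_inputs_with_special_tokens([10, 11], [12])  -> [0, 10, 11, 2, 12, 2]
    # and get_special_tokens_mask (defined below) marks exactly the added positions:
    #   get_special_tokens_mask([10, 11], [12])           -> [1, 0, 0, 1, 0, 1]
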
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
        pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use XLMTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses


__all__ = ["HerbertTokenizer"]
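
# Minimal usage sketch (assumes a published HerBERT checkpoint such as
# "allegro/herbert-base-cased" is available on the Hugging Face Hub and that
# `sacremoses` is installed):
#
#     from transformers import HerbertTokenizer
#
#     tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-base-cased")
#     encoding = tokenizer("Używamy tokenizera HerBERT.")
#     tokenizer.convert_ids_to_tokens(encoding["input_ids"])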