
"""Tokenization classes for FSMT."""

import json
import os
import re
import unicodedata
from typing import Dict, List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "src_vocab_file": "vocab-src.json",
    "tgt_vocab_file": "vocab-tgt.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs
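

# Illustrative only (not part of the upstream module): `get_pairs` produces the
# candidate merges consumed by `FSMTTokenizer.bpe` further down, e.g.
#
#     get_pairs(("h", "e", "llo</w>")) == {("h", "e"), ("e", "llo</w>")}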


def replace_unicode_punct(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl
    """
    text = text.replace("，", ",")
    text = re.sub(r"。\s*", ". ", text)
    text = text.replace("、", ",")
    text = text.replace("”", '"')
    text = text.replace("“", '"')
    text = text.replace("∶", ":")
    text = text.replace("：", ":")
    text = text.replace("？", "?")
    text = text.replace("《", '"')
    text = text.replace("》", '"')
    text = text.replace("）", ")")
    text = text.replace("！", "!")
    text = text.replace("（", "(")
    text = text.replace("；", ";")
    text = text.replace("１", "1")
    text = text.replace("」", '"')
    text = text.replace("「", '"')
    text = text.replace("０", "0")
    text = text.replace("３", "3")
    text = text.replace("２", "2")
    text = text.replace("５", "5")
    text = text.replace("６", "6")
    text = text.replace("９", "9")
    text = text.replace("７", "7")
    text = text.replace("８", "8")
    text = text.replace("４", "4")
    text = re.sub(r"．\s*", ". ", text)
    text = text.replace("～", "~")
    text = text.replace("’", "'")
    text = text.replace("…", "...")
    text = text.replace("━", "-")
    text = text.replace("〈", "<")
    text = text.replace("〉", ">")
    text = text.replace("【", "[")
    text = text.replace("】", "]")
    text = text.replace("％", "%")
    return text


def remove_non_printing_char(text):
    """
    Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl
    """
    output = []
    for char in text:
        cat = unicodedata.category(char)
        if cat.startswith("C"):
            continue
        output.append(char)
    return "".join(output)
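

# A quick doctest-style sketch of the two Moses ports above (values verified by
# hand, shown only as an illustration):
#
#     >>> replace_unicode_punct("Hello， world？")
#     'Hello, world?'
#     >>> remove_non_printing_char("abc\u200b")  # U+200B is category Cf, so it is dropped
#     'abc'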


class FSMTTokenizer(PreTrainedTokenizer):
    """
    Construct a FAIRSEQ Transformer tokenizer, based on Byte-Pair Encoding. The tokenization process is the following:

    - Moses preprocessing and tokenization.
    - Normalizing all input texts.
    - The arguments `special_tokens` and the function `set_special_tokens` can be used to add additional symbols (like
      "__classify__") to a vocabulary.
    - The argument `langs` defines a pair of languages.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        langs (`List[str]`, *optional*):
            A list of two languages to translate from and to, for instance `["en", "ru"]`.
        src_vocab_file (`str`, *optional*):
            File containing the vocabulary for the source language.
        tgt_vocab_file (`str`, *optional*):
            File containing the vocabulary for the target language.
        merges_file (`str`, *optional*):
            File containing the merges.
        do_lower_case (`bool`, *optional*, defaults to `False`):
            Whether or not to lowercase the input when tokenizing.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """
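
    # A minimal usage sketch (assuming the public "facebook/wmt19-en-ru"
    # checkpoint; any FSMT checkpoint with src/tgt vocab files works the same):
    #
    #     tokenizer = FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
    #     ids = tokenizer.encode("Machine learning is great")  # ends with </s>
    #     text = tokenizer.decode(ids, skip_special_tokens=True)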

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        langs=None,
        src_vocab_file=None,
        tgt_vocab_file=None,
        merges_file=None,
        do_lower_case=False,
        unk_token="<unk>",
        bos_token="<s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. See"
                " https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses

        self.src_vocab_file = src_vocab_file
        self.tgt_vocab_file = tgt_vocab_file
        self.merges_file = merges_file
        self.do_lower_case = do_lower_case

        # caches of sacremoses normalizer/tokenizer/detokenizer instances, keyed by lang
        self.cache_moses_punct_normalizer = {}
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        if langs and len(langs) == 2:
            self.src_lang, self.tgt_lang = langs
        else:
            raise ValueError(
                f"arg `langs` needs to be a list of 2 langs, e.g. ['en', 'ru'], but got {langs}. "
                "Usually that means that the tokenizer can't find a mapping for the given model path "
                "in the vocab file maps of this tokenizer."
            )

        with open(src_vocab_file, encoding="utf-8") as src_vocab_handle:
            self.encoder = json.load(src_vocab_handle)
        with open(tgt_vocab_file, encoding="utf-8") as tgt_vocab_handle:
            tgt_vocab = json.load(tgt_vocab_handle)
            self.decoder = {v: k for k, v in tgt_vocab.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            langs=langs,
            src_vocab_file=src_vocab_file,
            tgt_vocab_file=tgt_vocab_file,
            merges_file=merges_file,
            do_lower_case=do_lower_case,
            unk_token=unk_token,
            bos_token=bos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            **kwargs,
        )

    def get_vocab(self) -> Dict[str, int]:
        return self.get_src_vocab()
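    # Note: FSMT carries two vocabularies. `self.encoder` (source language) maps
    # token -> id for encoding, while `self.decoder` (target language, inverted at
    # load time) maps id -> token for decoding. `get_vocab`/`vocab_size` therefore
    # deliberately report the source side only.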
    @property
    def vocab_size(self) -> int:
        return self.src_vocab_size

    def moses_punct_norm(self, text, lang):
        if lang not in self.cache_moses_punct_normalizer:
            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
            self.cache_moses_punct_normalizer[lang] = punct_normalizer
        return self.cache_moses_punct_normalizer[lang].normalize(text)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def moses_pipeline(self, text, lang):
        text = replace_unicode_punct(text)
        text = self.moses_punct_norm(text, lang)
        text = remove_non_printing_char(text)
        return text

    @property
    def src_vocab_size(self):
        return len(self.encoder)

    @property
    def tgt_vocab_size(self):
        return len(self.decoder)

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.decoder, **self.added_tokens_decoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, lang="en", bypass_tokenizer=False):
        """
        Tokenize a string given language code using Moses.

        Details of tokenization:

            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
            - Install with `pip install sacremoses`

        Args:
            - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported
              languages. However, we don't enforce it.
            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False)
              (bool). If True, we only apply BPE.

        Returns:
            List of tokens.
        """
        # `lang` is currently not explicitly passed in by `tokenization_utils.py`,
        # so the source language configured at init time is always used.
        lang = self.src_lang

        if self.do_lower_case:
            text = text.lower()

        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_pipeline(text, lang=lang)
            text = self.moses_tokenize(text, lang=lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # remove BPE end-of-word markers, then let Moses detokenize
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        text = self.moses_detokenize(tokens, self.tgt_lang)
        return text
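    # Sketch of the `</w>` end-of-word convention handled above (tokens are
    # illustrative, not from a real vocab):
    #
    #     ["Mach", "ine</w>", "learn", "ing</w>"]
    #     -> "Machine learning " after the marker substitution and join
    #     -> ["Machine", "learning"] -> Moses detokenizer -> "Machine learning"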
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A FAIRSEQ Transformer sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return token_ids_0 + sep
        return token_ids_0 + sep + token_ids_1 + sep
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return ([0] * len(token_ids_0)) + [1]
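    # E.g., get_special_tokens_mask([5, 6], [7]) -> [0, 0, 1, 0, 1]: only the two
    # trailing `</s>` positions are flagged as special.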
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        src_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["src_vocab_file"]
        )
        tgt_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["tgt_vocab_file"]
        )
        merges_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(src_vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        with open(tgt_vocab_file, "w", encoding="utf-8") as f:
            tgt_vocab = {v: k for k, v in self.decoder.items()}
            f.write(json.dumps(tgt_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merges_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return src_vocab_file, tgt_vocab_file, merges_file

    def __getstate__(self):
        state = self.__dict__.copy()
        # the sacremoses module is not picklable; drop it and re-import on unpickle
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use FSMTTokenizer. See"
                " https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses


__all__ = ["FSMTTokenizer"]
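
# A minimal end-to-end sketch (assumes `pip install sacremoses` and access to the
# public "facebook/wmt19-en-ru" checkpoint, used here purely as an example):
#
#     from transformers import FSMTForConditionalGeneration, FSMTTokenizer
#
#     tok = FSMTTokenizer.from_pretrained("facebook/wmt19-en-ru")
#     model = FSMTForConditionalGeneration.from_pretrained("facebook/wmt19-en-ru")
#     batch = tok("Machine learning is great", return_tensors="pt")
#     out = model.generate(**batch)
#     print(tok.decode(out[0], skip_special_tokens=True))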