
    fThB                     P   S SK r S SKrS SKrS SKrS SKJr  S SKJr  S SKJ	r	J
r
JrJrJrJr  S SKrSSKJr  SSKJr  SSKJr  \R,                  " \5      rS	S
SSSS.rSr\" SS9 " S S\5      5       rS\S\
\\	4   S\R:                  4S jrS\SS4S jrS\S\\
\4   4S jr S/r!g)    N)Path)copyfile)AnyDictListOptionalTupleUnion   )PreTrainedTokenizer)logging)requiresz
source.spmz
target.spmz
vocab.jsonztarget_vocab.jsonztokenizer_config.json)
source_spm
target_spmvocabtarget_vocab_filetokenizer_config_fileu   ▁)sentencepiece)backendsc            
         ^  \ rS rSrSr\rSS/r\R                  " S5      r
         S+S\\\\4      SS4U 4S	 jjjrS
 rS\S\4S jrS rS\4S jrS\S\\   4S jrS\S\4S jrU 4S jrU 4S jrS\\   S\4S jrS,S\\   4S jjrS rS r\S\4S j5       rS,S\S\\   S\ \   4S jjr!S\4S jr"S r#S  r$S\4S! jr%S"\SS4S# jr&S$ r'S% r( S-S&\S'\\   S(\)S\\   4S) jjr*S*r+U =r,$ ).MarianTokenizer-   a  
Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    source_spm (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
        contains the vocabulary for the source language.
    target_spm (`str`):
        [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
        contains the vocabulary for the target language.
    source_lang (`str`, *optional*):
        A string representing the source language.
    target_lang (`str`, *optional*):
        A string representing the target language.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    model_max_length (`int`, *optional*, defaults to 512):
        The maximum sentence length the model accepts.
    additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
        Additional special tokens used by the tokenizer.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
            using the forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.
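
For example, subword regularization could be enabled at tokenization time with something like
the following (a sketch; the keys are the `SentencePieceProcessor` arguments listed above, and
the values are purely illustrative):

```python
>>> sp_kwargs = {"enable_sampling": True, "nbest_size": 64, "alpha": 0.1}
>>> tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de", sp_model_kwargs=sp_kwargs)
```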

Examples:

```python
>>> from transformers import MarianForCausalLM, MarianTokenizer

>>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
>>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
>>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
>>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

>>> outputs = model(**inputs)  # should work
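
>>> # Illustrative addition (not part of the upstream example): decode the target ids
>>> # back to text; decoding uses the target-side SentencePiece model unless
>>> # use_source_tokenizer=True is passed.
>>> tokenizer.batch_decode(inputs["labels"], skip_special_tokens=True)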
```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    language_code_re = re.compile(">>.+<<")  # type: re.Pattern

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder
        self._setup_normalizer()

        super().__init__(
            # bos_token=bos_token,  unused. Start decoding with config.decoder_start_token_id
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        match = self.language_code_re.match(text)
        code: list = [match.group(0)] if match else []
        return code, self.language_code_re.sub("", text)

    def _tokenize(self, text: str) -> List[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        """
Convert a list of lists of token ids into a list of strings by calling decode.

Args:
    sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
        List of tokenized input ids. Can be obtained using the `__call__` method.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.
    clean_up_tokenization_spaces (`bool`, *optional*):
        Whether or not to clean up the tokenization spaces. If `None`, will default to
        `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
    use_source_tokenizer (`bool`, *optional*, defaults to `False`):
        Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
        problems).
    kwargs (additional keyword arguments, *optional*):
        Will be passed to the underlying model specific decode method.

Returns:
    `List[str]`: The list of decoded sentences.
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        """
Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
tokens and clean up tokenization spaces.

Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

Args:
    token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
        List of tokenized input ids. Can be obtained using the `__call__` method.
    skip_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to remove special tokens in the decoding.
    clean_up_tokenization_spaces (`bool`, *optional*):
        Whether or not to clean up the tokenization spaces. If `None`, will default to
        `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
    use_source_tokenizer (`bool`, *optional*, defaults to `False`):
        Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
        problems).
    kwargs (additional keyword arguments, *optional*):
        Will be passed to the underlying model specific decode method.

Returns:
    `str`: The decoded sentence.
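
Example (an illustrative sketch added here, not from the upstream docstring; it assumes an
`opus-mt-en-de` checkpoint loaded as `tokenizer`):

```python
>>> ids = tokenizer("I am a small frog.")["input_ids"]
>>> tokenizer.decode(ids, skip_special_tokens=True, use_source_tokenizer=True)
```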
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"],
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> Dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> Dict:
        state = self.__dict__.copy()
        state.update(
            dict.fromkeys(["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"])
        )
        return state

    def __setstate__(self, d: Dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[Dict, List]:
    with open(path) as f:
        return json.load(f)


__all__ = ["MarianTokenizer"]