
    fThJ                         S SK r S SKJr  S SKJrJrJrJrJr  S SK	r
SSKJrJrJr  SSKJr  SSKJr  \R&                  " \5      rSrS	S
S.r/ SQ/ SQS.rSSSSSSSS.r\" SS9 " S S\5      5       rS/rg)    N)copyfile)AnyDictListOptionalTuple   )
AddedTokenBatchEncodingPreTrainedTokenizer)logging)requiresu   ▁zsentencepiece.bpe.modelztokenizer.json)
vocab_filetokenizer_file)__java__
__python__	__en_XX__)r   r   r   __javascript____php____ruby____go__)basemultir   r   r   r   r   r   r   )javapythonen_XX
javascriptphprubygo)sentencepiece)backendsc                     ^  \ rS rSr% Sr\rSS/r/ r\	\
   \S'   / r\	\
   \S'                 S.S\\\\4      4U 4S	 jjjrS
 rS r\S 5       r\S\4S j5       r\R.                  S\SS4S j5       r S/S\	\
   S\\	\
      S\S\	\
   4U 4S jjjr S0S\	\
   S\\	\
      S\	\
   4S jjr S0S\	\
   S\\	\
      S\	\
   4S jjrS\S\\   S\\   4S jrS rS\S\	\   4S jrS rS r S  r!S0S!\S"\\   S\"\   4S# jjr#   S1S$\	\   S\S%\\	\      S\S\$4
U 4S& jjjr%S' r&S( r'S2S) jr(S*\SS4S+ jr)S*\S\4S, jr*S-r+U =r,$ )3PLBartTokenizer2   a  
Construct a PLBART tokenizer.

Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).

The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
<tokens> <eos>` for target language documents.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    src_lang (`str`, *optional*):
        A string representing the source language.
    tgt_lang (`str`, *optional*):
        A string representing the target language.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The start of sequence token.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The cls token, which is a special token used as the first token for all tasks.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values. This is the token used when training this model with masking tasks. This
        is only used in the `"base"` tokenizer type. For `"multi"` tokenizer, masking is never done for the
        downstream tasks.
    language_codes (`str`, *optional*, defaults to `"base"`):
        What language codes to use. Should be one of `"base"` or `"multi"`.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:
        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
            using forward-filtering-and-backward-sampling algorithm.
        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

Examples:

```python
>>> from transformers import PLBartTokenizer

>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
>>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
>>> expected_translation_english = "Returns the maximum value of a b c."
>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
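>>> # Illustrative additions (a sketch; not from the original docs). The source side follows
>>> # `<tokens> <eos> <language code>`, so the encoded input ends with "</s>" and "__python__":
>>> tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][-2:].tolist())
['</s>', '__python__']
>>> # `sp_model_kwargs` can enable subword regularization (the values here are arbitrary):
>>> sampling_tokenizer = PLBartTokenizer.from_pretrained(
...     "uclanlp/plbart-python-en_XX", sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1}
... )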
```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        language_codes="base",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        clean_up_tokenization_spaces=True,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. include the space before it
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        src_lang = self._convert_lang_code_special_format(src_lang)
        tgt_lang = self._convert_lang_code_special_format(tgt_lang)

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file
        self.language_codes = language_codes
        fairseq_language_codes = FAIRSEQ_LANGUAGE_CODES[self.language_codes]

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the original fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(fairseq_language_codes)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}

        if self.language_codes == "base":
            self.fairseq_tokens_to_ids["<mask>"] = (
                len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset
            )

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        if self.language_codes == "base":
            self._src_lang = src_lang
            self.cur_lang_code_id = (
                self.lang_code_to_id[self._src_lang] if self._src_lang is not None else self._src_lang
            )
        else:
            self._src_lang = src_lang if src_lang is not None else "__en_XX__"
            self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            language_codes=language_codes,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    @property
    def vocab_size(self):
        if self.language_codes == "base":
            # Plus 1 for the mask token
            return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1
        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        new_src_lang = self._convert_lang_code_special_format(new_src_lang)
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
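
Examples (an illustrative sketch reusing the checkpoint from the class docstring; the ids are arbitrary):

```python
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python")
>>> # Three ordinary ids are followed by the two suffix special tokens `[eos, src_lang_code]`.
>>> tokenizer.get_special_tokens_mask([10, 11, 12])
[0, 0, 0, 1, 1]
```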
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A PLBART sequence has the following format, where `X` represents the sequence:

- `input_ids` (for encoder) `X [eos, src_lang_code]`
- `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
separator.
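
For example (an illustrative sketch; the ids are arbitrary), the two appended suffix tokens decode back to the
EOS and source-language code tokens:

```python
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python")
>>> ids = tokenizer.build_inputs_with_special_tokens([10, 11])
>>> tokenizer.convert_ids_to_tokens(ids[-2:])
['</s>', '__python__']
```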

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. PLBart does not
make use of token type ids, therefore a list of zeros is returned.
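
For example (an illustrative sketch; the ids are arbitrary), three tokens yield five zeros, one per token plus the
`cls` and `sep` positions:

```python
>>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX")
>>> tokenizer.create_token_type_ids_from_sequences([10, 11, 12])
[0, 0, 0, 0, 0]
```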

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by translation pipeline, to prepare inputs for the generate function"""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(self.tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "python",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = self._convert_lang_code_special_format(src_lang)
        self.tgt_lang = self._convert_lang_code_special_format(tgt_lang)
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        src_lang = self._convert_lang_code_special_format(src_lang)
        self.cur_lang_code = self.lang_code_to_id[src_lang] if src_lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        lang = self._convert_lang_code_special_format(lang)
        self.cur_lang_code = self.lang_code_to_id[lang] if lang is not None else None
        self.prefix_tokens = []
        if self.cur_lang_code is not None:
            self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
        else:
            self.suffix_tokens = [self.eos_token_id]

    def _convert_lang_code_special_format(self, lang: str) -> str:
        """Convert Language Codes to format tokenizer uses if required"""
        lang = FAIRSEQ_LANGUAGE_CODES_MAP[lang] if lang in FAIRSEQ_LANGUAGE_CODES_MAP.keys() else lang
        return lang


__all__ = ["PLBartTokenizer"]