import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

FAIRSEQ_LANGUAGE_CODES = [
    "ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN",
    "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO",
    "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN",
]


@requires(backends=("sentencepiece",))
class MBartTokenizer(PreTrainedTokenizer):
    """
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    # Rewritten by set_src_lang_special_tokens / set_tgt_lang_special_tokens below:
    # mBART adds no prefix and suffixes every sequence with [eos, language code].
    prefix_tokens: List[int] = []
    suffix_tokens: List[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = (
            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # The fairseq vocab pins the four special tokens to the first ids; the spm ids
        # are shifted by `fairseq_offset` to stay aligned with it.
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        self.fairseq_offset = 1
        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=None,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)
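    # Worked id-layout sketch (assumes the released 250,000-piece SentencePiece model;
    # other vocabularies shift these numbers):
    #     spm piece i        -> fairseq id i + 1           (fairseq_offset)
    #     "ar_AR".."zh_CN"   -> ids 250001..250025         (sp_model_size + i + offset)
    #     "<mask>"           -> id 250026
    #     vocab_size         -> 250000 + 25 + 1 + 1 = 250027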
Jc                 ~    U R                   R                  5       nS US'   U R                  R                  5       US'   U$ )NrM   sp_model_proto)__dict__copyrM   serialized_model_proto)rb   states     rk   __getstate__MBartTokenizer.__getstate__   s;    ""$ j"&--"F"F"H    c                     Xl         [        U S5      (       d  0 U l        [        R                  " S0 U R                  D6U l        U R
                  R                  U R                  5        g )Nr2   rH   )ro   hasattrr2   rK   rL   rM   LoadFromSerializedProtorn   )rb   ds     rk   __setstate__MBartTokenizer.__setstate__   sR     t.//#%D 22JT5I5IJ--d.A.ABru   c                 x    [        U R                  5      [        U R                  5      -   U R                  -   S-   $ )Nr6   )rQ   rM   rU   rP   rb   s    rk   
vocab_sizeMBartTokenizer.vocab_size   s2    4==!C(<(<$==@S@SSVWWWru   returnc                     U R                   $ N)r_   r}   s    rk   rE   MBartTokenizer.src_lang   s    ~~ru   new_src_langc                 F    Xl         U R                  U R                   5        g r   )r_   ra   )rb   r   s     rk   rE   r      s    %((8ru   token_ids_0token_ids_1already_has_special_tokensc                   > U(       a  [         TU ]  XSS9$ S/[        U R                  5      -  nS/[        U R                  5      -  nUc  US/[        U5      -  -   U-   $ US/[        U5      -  -   S/[        U5      -  -   U-   $ )ad  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
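    # Layout sketch (hypothetical ids 100, 101; real values depend on the checkpoint's
    # vocabulary): with src_lang="en_XX",
    #     build_inputs_with_special_tokens([100, 101])
    #     == [100, 101, eos_token_id, lang_code_to_id["en_XX"]]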
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
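    # Example: for a hypothetical two-token input, the returned segment ids are all zero,
    #     create_token_type_ids_from_sequences([10, 20]) == [0, 0, 0, 0]   # cls + 2 tokens + sep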
    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by the translation pipeline to prepare inputs for the generate function."""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Return the unknown token id if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) into a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)
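    # Pipeline sketch (illustrative; "Hello" is an arbitrary string, and the language codes
    # must be members of FAIRSEQ_LANGUAGE_CODES):
    #     inputs = tok._build_translation_inputs("Hello", return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
    #     inputs["forced_bos_token_id"]  # id of "ro_RO", forcing the decoder's first generated token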
    def prepare_seq2seq_batch(
        self,
        src_texts: List[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[List[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[src_lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]


__all__ = ["MBartTokenizer"]
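# Minimal usage sketch (comment-only so nothing runs at import time; the checkpoint is the
# one from the class docstring, and the target string is an arbitrary illustration):
#
#     from transformers import MBartTokenizer
#
#     tok = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
#     batch = tok("Hello world", text_target="Salut lume", return_tensors="pt")
#     # input_ids end with [</s>, en_XX-id]; labels end with [</s>, ro_RO-id], as set by
#     # set_src_lang_special_tokens / set_tgt_lang_special_tokens above.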