"""Tokenization classes for XGLM."""

import os
from shutil import copyfile
from typing import Any, Dict, List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}


@requires(backends=("sentencepiece",))
class XGLMTokenizer(PreTrainedTokenizer):
    """
Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
[SentencePiece](https://github.com/google/sentencepiece).

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    sp_model_kwargs (`dict`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
            using the forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.
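
        For instance, subword regularization could be enabled with something like the following
        (an illustrative sketch, not a prescribed configuration; `vocab_file` stands for a path to
        a SentencePiece model file):

        ```python
        XGLMTokenizer(vocab_file, sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1})
        ```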

Attributes:
    sp_model (`SentencePieceProcessor`):
        The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
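
Example (an illustrative sketch rather than an official snippet; it assumes `facebook/xglm-564M` is an
available XGLM checkpoint, and the exact pieces depend on the vocabulary file):

```python
>>> from transformers import XGLMTokenizer

>>> tokenizer = XGLMTokenizer.from_pretrained("facebook/xglm-564M")
>>> pieces = tokenizer.tokenize("Hello world")  # SentencePiece pieces, e.g. ['▁Hello', '▁world']
>>> tokenizer.convert_tokens_to_string(pieces)  # lossless round-trip back to text
'Hello world'
```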
	input_idsattention_maskNsp_model_kwargsreturnc	                 v  > Uc  0 OUU l         SU l        [        U R                  5       V
s/ s H	  n
SU
 S3PM     nn
U	R                  S/ 5      =(       d    / U	S'   U	S==   U Vs/ s H  oU	S   ;  d  M  UPM     sn-  ss'   [        R
                  " S0 U R                   D6U l        U R                  R                  [        U5      5        Xl	        SU l
        SSSSS	.U l        [        U R                  5      n[        U R                  5       V
s0 s H  n
SU
 S3X-   U R                  -   _M     nn
U R                  R                  U5        U R                  R                  5        VVs0 s H  u  pX_M	     snnU l        [         TU ]D  " SUUUUUUU R                   S
.U	D6  g s  sn
f s  snf s  sn
f s  snnf )N   z<madeupword>additional_special_tokens   r      r	   )<s><pad></s><unk>)	bos_token	eos_token	unk_token	sep_token	cls_token	pad_tokenr    )r   num_madeup_wordsrangegetspmSentencePieceProcessorsp_modelLoadstrr   fairseq_offsetfairseq_tokens_to_idslenupdateitemsfairseq_ids_to_tokenssuper__init__)selfr   r!   r"   r$   r%   r#   r&   r   kwargsimadeup_wordswordsp_sizekv	__class__s                   b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/xglm/tokenization_xglm.pyr7   XGLMTokenizer.__init__g   s    &5%<r/ !"49$:O:O4PQ4Pq+aS*4PQ.4jj9TVX.Y._]_*+*+)0
)T@[9\-\D\0
 	
+ 22JT5I5IJ3z?+$   ./APQ%R"dmm$W\]a]r]rWstWsRS+aS*GK$:M:M,MMWst"")),77;7Q7Q7W7W7Y%Z7Ytqad7Y%Z" 		
 00		
 		
; R0
* u &[s   F&&F+
6F+
 F0,F5c                 ~    U R                   R                  5       nS US'   U R                  R                  5       US'   U$ )Nr-   sp_model_proto)__dict__copyr-   serialized_model_proto)r8   states     rA   __getstate__XGLMTokenizer.__getstate__   s;    ""$ j"&--"F"F"H    c                     Xl         [        U S5      (       d  0 U l        [        R                  " S0 U R                  D6U l        U R
                  R                  U R                  5        g )Nr   r'   )rE   hasattrr   r+   r,   r-   LoadFromSerializedProtorD   )r8   ds     rA   __setstate__XGLMTokenizer.__setstate__   sR     t.//#%D 22JT5I5IJ--d.A.ABrK   token_ids_0token_ids_1c                 \    Uc  U R                   /U-   $ U R                   /nX1-   U-   U-   U-   $ )a;  
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. An XGLM sequence has the following format:

- single sequence: `</s> X`
- pair of sequences: `</s> A </s></s> B`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
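
Example (a minimal sketch with placeholder ids, assuming `tokenizer` is an instantiated
`XGLMTokenizer`; only the placement of `sep_token_id` is the point):

```python
>>> ids = tokenizer.build_inputs_with_special_tokens([10, 11])
>>> ids == [tokenizer.sep_token_id, 10, 11]
True
```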
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + sep + token_ids_1

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
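
Example (placeholder ids, assuming `tokenizer` is an instantiated `XGLMTokenizer`; the mask does
not depend on the vocabulary):

```python
>>> tokenizer.get_special_tokens_mask([10, 11])
[1, 0, 0]
>>> tokenizer.get_special_tokens_mask([10, 11], [12])
[1, 0, 0, 1, 1, 0]
```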
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1))

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Create a mask from the two sequences passed to be used in a sequence-pair classification task. XGLM does
not make use of token type ids, therefore a list of zeros is returned.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of zeros.
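
Example (placeholder ids, assuming `tokenizer` is an instantiated `XGLMTokenizer`; one zero per
position of the final sequence with special tokens):

```python
>>> tokenizer.create_token_type_ids_from_sequences([10, 11])
[0, 0, 0]
>>> tokenizer.create_token_type_ids_from_sequences([10, 11], [12])
[0, 0, 0, 0, 0, 0]
```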

        """
        sep = [self.sep_token_id]

        if token_ids_1 is None:
            return len(sep + token_ids_0) * [0]
        return len(sep + token_ids_0 + sep + sep + token_ids_1) * [0]

    @property
    def vocab_size(self):
        return len(self.sp_model) + self.fairseq_offset + self.num_madeup_words

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # spm returns 0 for out-of-vocabulary pieces; map that to the unk token id
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["XGLMTokenizer"]