"""Tokenization classes for BioGPT."""

import json
import os
from typing import List, Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class BioGptTokenizer(PreTrainedTokenizer):
    """
    Construct a FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of
            sequence. The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
            for sequence classification or for a text and a question for question answering. It is also used as the
            last token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
    """
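
    # Typical usage (a minimal sketch, not executed here; assumes the public
    # "microsoft/biogpt" checkpoint and an installed `sacremoses`):
    #
    #     >>> from transformers import BioGptTokenizer
    #     >>> tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
    #     >>> tokenizer("Aspirin inhibits platelet aggregation.")["input_ids"]
    #
    # The exact ids depend on the checkpoint's vocab.json / merges.txt files.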

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.lang = "en"
        self.sm = sacremoses

        # Moses (de)tokenizer instances are built lazily and cached per language.
        self.cache_moses_tokenizer = {}
        self.cache_moses_detokenizer = {}

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[:-1]
        merges = [tuple(merge.split()[:2]) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            sep_token=sep_token,
            unk_token=unk_token,
            pad_token=pad_token,
            **kwargs,
        )
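
    # Direct construction from local files also works (illustrative; the file
    # names here are hypothetical):
    #
    #     >>> tokenizer = BioGptTokenizer(vocab_file="vocab.json", merges_file="merges.txt")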

    @property
    def vocab_size(self):
        """Returns vocab size"""
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def moses_tokenize(self, text, lang):
        if lang not in self.cache_moses_tokenizer:
            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
            self.cache_moses_tokenizer[lang] = moses_tokenizer
        return self.cache_moses_tokenizer[lang].tokenize(
            text, aggressive_dash_splits=True, return_str=False, escape=True
        )

    def moses_detokenize(self, tokens, lang):
        if lang not in self.cache_moses_detokenizer:
            moses_detokenizer = self.sm.MosesDetokenizer(lang=lang)
            self.cache_moses_detokenizer[lang] = moses_detokenizer
        return self.cache_moses_detokenizer[lang].detokenize(tokens)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            # merge the best-ranked adjacent pair until no known merge remains
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text, bypass_tokenizer=False):
        """Returns a tokenized string."""
        if bypass_tokenizer:
            text = text.split()
        else:
            text = self.moses_tokenize(text, self.lang)

        split_tokens = []
        for token in text:
            if token:
                split_tokens.extend(list(self.bpe(token).split(" ")))

        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        # remove the BPE end-of-word markers, then let Moses detokenize
        tokens = [t.replace(" ", "").replace("</w>", " ") for t in tokens]
        tokens = "".join(tokens).split()
        text = self.moses_detokenize(tokens, self.lang)
        return text

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BioGPT sequence has the following format:

        - single sequence: `</s> X`
        - pair of sequences: `</s> A </s> B`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return [self.sep_token_id] + token_ids_0
        sep = [self.sep_token_id]
        return sep + token_ids_0 + sep + token_ids_1
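
    # Concretely (hypothetical ids, assuming `sep_token_id` is 2):
    #
    #     >>> tokenizer.build_inputs_with_special_tokens([5, 6])
    #     [2, 5, 6]
    #     >>> tokenizer.build_inputs_with_special_tokens([5, 6], [7, 8])
    #     [2, 5, 6, 2, 7, 8]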

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )
        # no bos used in fairseq
        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))
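
    # For the pair case, the mask flags each prepended separator (illustrative,
    # mirroring the layout of `build_inputs_with_special_tokens`):
    #
    #     >>> tokenizer.get_special_tokens_mask([5, 6], [7, 8])
    #     [1, 0, 0, 1, 0, 0]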

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
        Transformer sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        """
        sep = [self.sep_token_id]

        # no bos used in fairseq
        if token_ids_1 is None:
            return len(token_ids_0 + sep) * [0]
        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
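
    # Note that a separator is counted with each segment, so (illustrative):
    #
    #     >>> tokenizer.create_token_type_ids_from_sequences([5, 6], [7])
    #     [0, 0, 0, 1, 1]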

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file

    def __getstate__(self):
        state = self.__dict__.copy()
        # the sacremoses module itself is not picklable; it is re-imported in __setstate__
        state["sm"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        try:
            import sacremoses
        except ImportError:
            raise ImportError(
                "You need to install sacremoses to use BioGptTokenizer. "
                "See https://pypi.org/project/sacremoses/ for installation."
            )

        self.sm = sacremoses
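
# Saving writes the two files named in VOCAB_FILES_NAMES back to disk
# (illustrative; the target directory is hypothetical and must already exist):
#
#     >>> tokenizer.save_vocabulary("./biogpt-tokenizer")
#     ('./biogpt-tokenizer/vocab.json', './biogpt-tokenizer/merges.txt')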


__all__ = ["BioGptTokenizer"]