
"""Tokenization classes for ESM."""

import os
from typing import List, Optional

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab_file(vocab_file):
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [l.strip() for l in lines]
A(c            
         ^  \ rS rSrSr\rSS/r     SU 4S jjrS\	S\
4S jrS	\
S\	4S
 jrS rS rS	\
S\	4S jrS\	S\
4S jr SS\\	   S\\\	      S\\	   4S jjr SS\S\\   S\S\\	   4S jjrS r\S\	4S j5       rSrU =r$ )EsmTokenizer#   z
Constructs an ESM tokenizer.
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register every vocab token as a no-split token so the tokenizer's trie
        # splits unspaced protein strings into single residues.
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)

    def _convert_id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def _convert_token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def _tokenize(self, text, **kwargs):
        return text.split()
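
    # Note that `_tokenize` itself only splits on whitespace; raw protein strings
    # still come out one token per residue because every vocab token is a no-split
    # token (see `__init__`). A sketch, assuming a standard ESM vocab:
    #
    #     tokenizer.tokenize("MKT")  # -> ["M", "K", "T"]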

    def get_vocab(self):
        base_vocab = self._token_to_id.copy()
        base_vocab.update(self.added_tokens_encoder)
        return base_vocab

    def token_to_id(self, token: str) -> int:
        return self._token_to_id.get(token, self._token_to_id.get(self.unk_token))

    def id_to_token(self, index: int) -> str:
        return self._id_to_token.get(index, self.unk_token)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        cls = [self.cls_token_id]
        sep = [self.eos_token_id]
        if token_ids_1 is None:
            if self.eos_token_id is None:
                return cls + token_ids_0
            return cls + token_ids_0 + sep
        if self.eos_token_id is None:
            raise ValueError("Cannot tokenize multiple sequences when EOS token is not set!")
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return [1 if token in self.all_special_ids else 0 for token in token_ids_0]
 LWW;%$"6"66AA=;WWsqcC,,-3"QC#k**aS00D	 Xs   A,c                    [         R                  R                  X(       a  US-   OSS-   5      n[        US5       nUR	                  SR                  U R
                  5      5        S S S 5        U4$ ! , (       d  f       U4$ = f)N- r	   w
)ospathjoinr   writer!   )r*   save_directoryfilename_prefixr   r   s        r   save_vocabularyEsmTokenizer.save_vocabulary   si    WW\\.O?S3Hacgr2rs
*c"aGGDIIdoo./ #} #"}s   +A11
Bc                 ,    [        U R                  5      $ r3   )rb   r!   )r*   s    r   

    @property
    def vocab_size(self) -> int:
        return len(self.all_tokens)


__all__ = ["EsmTokenizer"]
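
# Usage sketch (assumes a checkpoint that ships an ESM-style vocab.txt, e.g. the
# public "facebook/esm2_t6_8M_UR50D" on the Hub):
#
#     from transformers import EsmTokenizer
#
#     tokenizer = EsmTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
#     enc = tokenizer("MKTAYIAKQR")
#     # enc["input_ids"] -> [cls_id, *per_residue_ids, eos_id]
#     # enc["attention_mask"] -> [1] * len(enc["input_ids"])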