"""Tokenization classes for ESM."""

import os
from typing import List, Optional

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab_file(vocab_file):
    with open(vocab_file, "r") as f:
        lines = f.read().splitlines()
        return [line.strip() for line in lines]
       s   e Zd ZdZeZddgZ					d' fd	d
	Zdede	fddZ
de	defddZdd Zdd Zde	defddZdede	fddZ	d(dee deee  dee fddZ	d)dedee d edee fd!d"Zd#d$ Zedefd%d&Z  ZS )*EsmTokenizerz&
    Constructs an ESM tokenizer.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        eos_token="<eos>",
        **kwargs,
    ):
        self.all_tokens = load_vocab_file(vocab_file)
        self._id_to_token = dict(enumerate(self.all_tokens))
        self._token_to_id = {tok: ind for ind, tok in enumerate(self.all_tokens)}
        super().__init__(
            unk_token=unk_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            eos_token=eos_token,
            **kwargs,
        )

        # Register every vocabulary token as a no-split token so the
        # pre-tokenization trie never breaks one apart.
        self.unique_no_split_tokens = self.all_tokens
        self._update_trie(self.unique_no_split_tokens)
zEsmTokenizer.__init__indexreturnc                 C      | j || jS Nr'   getr   r+   r/   r
   r
   r   _convert_id_to_tokenG      z!EsmTokenizer._convert_id_to_tokentokenc                 C      | j || j | jS r2   r(   r4   r   r+   r8   r
   r
   r   _convert_token_to_idJ      z!EsmTokenizer._convert_token_to_idc                 K   s   |  S r2   )split)r+   textr,   r
   r
   r   	_tokenizeM   s   zEsmTokenizer._tokenizec                 C   s   | j  }|| j |S r2   )r(   copyupdateZadded_tokens_encoder)r+   Z
base_vocabr
   r
   r   	get_vocabP   s   
zEsmTokenizer.get_vocabc                 C   r9   r2   r:   r;   r
   r
   r   token_to_idU   r=   zEsmTokenizer.token_to_idc                 C   r1   r2   r3   r5   r
   r
   r   id_to_tokenX   r7   zEsmTokenizer.id_to_tokenNtoken_ids_0token_ids_1c                 C   s\   | j g}| jg}|d u r| jd u r|| S || | S | jd u r$td|| | | | S )Nz=Cannot tokenize multiple sequences when EOS token is not set!)Zcls_token_idZeos_token_id
ValueError)r+   rF   rG   clssepr
   r
   r    build_inputs_with_special_tokens[   s   

    def get_special_tokens_mask(
        self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of ids of the first sequence.
            token_ids_1 (`List[int]`, *optional*):
                List of ids of the second sequence.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        NzYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.c                    s   g | ]}| j v rd ndqS )   r   )Zall_special_ids)r   r8   r+   r
   r   r      s    z8EsmTokenizer.get_special_tokens_mask.<locals>.<listcomp>rM   r   )rH   len)r+   rF   rG   rL   maskr
   rN   r   get_special_tokens_maski   s   z$EsmTokenizer.get_special_tokens_maskc                 C   sd   t j||r
|d ndd }t|d}|d| j W d    |fS 1 s*w   Y  |fS )N- r   w
)ospathjoinr   writer$   )r+   Zsave_directoryZfilename_prefixr   r   r
   r
   r   save_vocabulary   s   
zEsmTokenizer.save_vocabularyc                 C   s
   t | jS r2   )rO   r$   rN   r
   r
   r   
vocab_size   s   
zEsmTokenizer.vocab_size)r   r   r   r   r   r2   )NF)__name__
__module____qualname____doc__VOCAB_FILES_NAMESZvocab_files_namesZmodel_input_namesr*   intstrr6   r<   r@   rC   rD   rE   r   r   rK   boolrQ   rZ   propertyr[   __classcell__r
   r
   r-   r   r   #   sJ    


r   )r_   rV   typingr   r   Ztokenization_utilsr   utilsr   Z
get_loggerr\   loggerr`   r   r   __all__r
   r
   r
   r   <module>   s   

p
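
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative; not part of the original module). The toy
# vocabulary below is an assumption for the demo; real checkpoints ship their
# own vocab.txt.
if __name__ == "__main__":
    import tempfile

    toy_vocab = ["<cls>", "<pad>", "<eos>", "<unk>", "L", "A", "G", "V", "<mask>"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        vocab_path = os.path.join(tmp_dir, "vocab.txt")
        with open(vocab_path, "w") as f:
            f.write("\n".join(toy_vocab))

        tokenizer = EsmTokenizer(vocab_file=vocab_path)
        # The no-split trie splits the raw sequence per residue, and
        # build_inputs_with_special_tokens wraps it as <cls> ... <eos>.
        print(tokenizer("LAGV")["input_ids"])  # -> [0, 4, 5, 6, 7, 2] with the toy vocab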