o
    Zh &                     @   s   d Z ddlZddlZddlmZmZmZ ddlmZm	Z	 e r#ddl
Z
ddlmZ ddlmZ eeZdd	iZd
d ZG dd dZG dd deZdgZdS )z Tokenization classes for CPMAnt.    N)ListOptionalTuple)is_jieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                 C   sf   t  }t| ddd}| }W d   n1 sw   Y  t|D ]\}}|d}|||< q#|S )z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r
   vocabreadertokensindextoken r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocab$   s   


r   c                   @   s   e Zd ZdddZdd ZdS )	WordpieceTokenizer<unk>   c                 C   s   || _ || _|| _d S N)r   	unk_tokenmax_input_chars_per_word)selfr   r"   r#   r   r   r   __init__0   s   
zWordpieceTokenizer.__init__c                 C   s   t |}t|| jkr| jgS d}g }|t|k rXt|}d }||k r<d||| }|| jv r4|}n|d8 }||k s#|d u rK|| j |d7 }n|| |}|t|k s|S )Nr       )listlenr#   r"   joinr   append)r$   r   charsstartZ
sub_tokensendZ
cur_substrsubstrr   r   r   tokenize5   s,   


zWordpieceTokenizer.tokenizeN)r   r    )__name__
__module____qualname__r%   r0   r   r   r   r   r   /   s    
r   c                
       s@  e Zd ZdZeZddgZdZ							
			d4 fdd	Ze	dd Z
e	dd Ze	dd Ze	defddZdd Zdd Z fddZdd  Zd!ee defd"d#Zd$d% Zd&d' Zd5d)ed*ee dee fd+d,Z	(d5d-ee d.eee  dee fd/d0Z	d6d-ee d.eee  d1edee f fd2d3Z  ZS )7CpmAntTokenizera  
    Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The line token.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The space token.
    Z	input_idsZattention_maskF<d></d><s></s><pad>r   </n></_>leftc                    s   t | dg || _|| _t|| _| j|	 | jd< | j| | jd< | j|	= | j|= tt| j dd d| _dd | j D | _	t
| j|d	| _t jd||||||||	|
d
	| d S )Njieba r   c                 S      | d S Nr'   r   xr   r   r   <lambda>       z*CpmAntTokenizer.__init__.<locals>.<lambda>keyc                 S   s   i | ]\}}||qS r   r   ).0kvr   r   r   
<dictcomp>   s    z,CpmAntTokenizer.__init__.<locals>.<dictcomp>)r   r"   )		bod_token	eod_token	bos_token	eos_token	pad_tokenr"   
line_tokenspace_tokenpadding_sider   )r   rK   rL   r   encoderr   r   sorteditemsdecoderr   wordpiece_tokenizersuperr%   )r$   r
   rK   rL   rM   rN   rO   r"   rP   rQ   rR   kwargs	__class__r   r   r%   l   s0   


zCpmAntTokenizer.__init__c                 C      | j | j S r!   )rS   rK   r$   r   r   r   bod_token_id      zCpmAntTokenizer.bod_token_idc                 C   r\   r!   )rS   rL   r]   r   r   r   eod_token_id   r_   zCpmAntTokenizer.eod_token_idc                 C   s
   | j d S )Nr   rS   r]   r   r   r   
newline_id      
zCpmAntTokenizer.newline_idreturnc                 C   s
   t | jS r!   )r)   rS   r]   r   r   r   
vocab_size   rc   zCpmAntTokenizer.vocab_sizec                 C   s   t | jfi | jS r!   )dictrS   Zadded_tokens_encoderr]   r   r   r   	get_vocab   s   zCpmAntTokenizer.get_vocabc                 C   s.   g }t j|ddD ]}|| j| q	|S )zTokenize a string.F)Zcut_all)r=   cutextendrW   r0   )r$   textZoutput_tokensrB   r   r   r   	_tokenize   s   zCpmAntTokenizer._tokenizec                    s4   dd |D } fdd|D }t  j|fi |S )zDecode ids into a string.c                 S   s   g | ]}|d kr|qS )r   r   )rG   ir   r   r   
<listcomp>   s    z+CpmAntTokenizer._decode.<locals>.<listcomp>c                    s.   g | ]}| j kr| jkr| jkr|qS r   )Zpad_token_idZeos_token_idbos_token_id)rG   rB   r]   r   r   rm      s    ()rX   _decode)r$   Z	token_idsrY   rZ   r]   r   ro      s
   
zCpmAntTokenizer._decodec                 C   s
   || j v S r!   ra   r$   r   r   r   r   check      
zCpmAntTokenizer.checkr   c                 C   s
   d |S )Nr&   )r*   )r$   r   r   r   r   convert_tokens_to_string   rr   z(CpmAntTokenizer.convert_tokens_to_stringc                 C   s   | j || j | jS )z0Converts a token (str) in an id using the vocab.)rS   getr"   rp   r   r   r   _convert_token_to_id   s   z$CpmAntTokenizer._convert_token_to_idc                 C   s   | j || jS )z=Converts an index (integer) in a token (str) using the vocab.)rV   rt   r"   )r$   r   r   r   r   _convert_id_to_token   s   z$CpmAntTokenizer._convert_id_to_tokenNsave_directoryfilename_prefixc                 C   s*  t j|rt j||r|d ndtd  }n
|r|d nd| }d}d| jv r5| jd | jd< | jd= d| jv rF| jd | jd< | jd= tt| j	 d	d
 d| _t
|ddd.}| j	 D ]\}}||krutd| d |}||d  |d7 }qbW d    |fS 1 sw   Y  |fS )N-r&   r
   r   r>   r;   r   r:   c                 S   r?   r@   r   rA   r   r   r   rC      rD   z1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>rE   wr   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r'   )ospathisdirr*   VOCAB_FILES_NAMESrS   r   r   rT   rU   r   loggerwarningwrite)r$   rw   rx   r
   r   writerr   Ztoken_indexr   r   r   save_vocabulary   s6   






zCpmAntTokenizer.save_vocabularytoken_ids_0token_ids_1c                 C   s,   |du r
| j g| S | j g| | j g | S )a1  
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `List[int]`: The model input with special tokens.
        N)rn   )r$   r   r   r   r   r    build_inputs_with_special_tokens   s   z0CpmAntTokenizer.build_inputs_with_special_tokensalready_has_special_tokensc                    sZ   |rt  j||ddS |dur#dgdgt|  dg dgt|  S dgdgt|  S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)r   r   r   Nr'   r   )rX   get_special_tokens_maskr)   )r$   r   r   r   rZ   r   r   r      s   (z'CpmAntTokenizer.get_special_tokens_mask)	r5   r6   r7   r8   r9   r   r:   r;   r<   r!   )NF)r1   r2   r3   __doc__r~   Zvocab_files_namesZmodel_input_namesZadd_prefix_spacer%   propertyr^   r`   rb   intre   rg   rk   ro   rq   r   strrs   ru   rv   r   r   r   r   boolr   __classcell__r   r   rZ   r   r4   O   sb    *


 


r4   )r   r   r{   typingr   r   r   Ztransformers.utilsr   r   r=   Ztokenization_utilsr   utilsr	   Z
get_loggerr1   r   r~   r   r   r4   __all__r   r   r   r   <module>   s    
  
B