"""Tokenization classes for CPMAnt."""

import collections
import os
from typing import List, Optional, Tuple

from transformers.utils import is_jieba_available, requires_backends


if is_jieba_available():
    import jieba

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}


def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        tokens = reader.readlines()
    for index, token in enumerate(tokens):
        token = token.rstrip("\n")
        vocab[token] = index
    return vocab
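
# For reference: the vocab file read above is plain text with one token per
# line, and the 0-based line number becomes the token id. A purely
# illustrative three-line file
#     <pad>
#     <unk>
#     <s>
# would load as {"<pad>": 0, "<unk>": 1, "<s>": 2}.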
A3c                   $    \ rS rSrSS jrS rSrg)WordpieceTokenizer/   c                 (    Xl         X l        X0l        g N)r   	unk_tokenmax_input_chars_per_word)selfr   r$   r%   s       r   __init__WordpieceTokenizer.__init__0   s    
"(@%    c                    [        U5      n[        U5      U R                  :  a  U R                  /$ Sn/ nU[        U5      :  a  [        U5      nS nX5:  a1  SR	                  X#U 5      nXpR
                  ;   a  UnOUS-  nX5:  a  M1  Uc!  UR                  U R                  5        US-  nOUR                  U5        UnU[        U5      :  a  M  U$ )Nr       )listlenr%   r$   joinr   append)r&   r   charsstart
sub_tokensend
cur_substrsubstrs           r   tokenizeWordpieceTokenizer.tokenize5   s    Uu:555NN##
c%j e*CJ+S!12ZZ'!'Jq + !!!$..1
!!*- c%j   r)   )r%   r$   r   N)<unk>   )__name__
__module____qualname____firstlineno__r'   r7   __static_attributes__ r)   r   r    r    /   s    A
r)   r    c            
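
# A sketch of the greedy longest-match-first behavior of tokenize(), using a
# toy, hypothetical vocabulary (real vocabularies come from the vocab file):
#
#   wp = WordpieceTokenizer(vocab={"un": 0, "believ": 1, "able": 2})
#   wp.tokenize("unbelievable")  # -> ["un", "believ", "able"]
#   wp.tokenize("xyz")           # -> ["<unk>", "<unk>", "<unk>"]
#
# When no substring starting at the current position matches, the outer loop
# emits unk_token and advances one character at a time.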
       t  ^  \ rS rSrSr\rSS/rSr         SU 4S jjr	\
S 5       r\
S 5       r\
S	 5       r\
S
\4S j5       rS rS rU 4S jrS rS\\   S
\4S jrS rS rSS\S\\   S
\\   4S jjr SS\\   S\\\      S
\\   4S jjr SS\\   S\\\      S\S
\\   4U 4S jjjrSrU =r $ ) CpmAntTokenizerO   a^  
Construct a CPMAnt tokenizer. Based on byte-level Byte-Pair-Encoding.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    bod_token (`str`, *optional*, defaults to `"<d>"`):
        The beginning of document token.
    eod_token (`str`, *optional*, defaults to `"</d>"`):
        The end of document token.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token.
    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token.
    line_token (`str`, *optional*, defaults to `"</n>"`):
        The line token.
    space_token (`str`, *optional*, defaults to `"</_>"`):
        The space token.
	input_idsattention_maskFc                 N  > [        U S/5        X l        X0l        [        U5      U l        U R                  U	   U R                  S'   U R                  U   U R                  S'   U R                  U		 U R                  U	 [
        R                  " [        U R                  R                  5       S S95      U l        U R                  R                  5        VVs0 s H  u  pX_M	     snnU l	        [        U R                  US9U l        [        TU ]4  " SUUUUUUUU	U
S.	UD6  g s  snnf )	Njieba r   c                     U S   $ Nr,   r@   xs    r   <lambda>*CpmAntTokenizer.__init__.<locals>.<lambda>       Z[\]Z^r)   key)r   r$   )		bod_token	eod_token	bos_token	eos_token	pad_tokenr$   
line_tokenspace_tokenpadding_sider@   )r   rR   rS   r   encoderr   r   sorteditemsdecoderr    wordpiece_tokenizersuperr'   )r&   r   rR   rS   rT   rU   rV   r$   rW   rX   rY   kwargskv	__class__s                 r   r'   CpmAntTokenizer.__init__l   s    	$	*""!*- LL5S!\\*5TLL%LL$"..vdll6H6H6JP^/_`)-););)=>)=)=>#5DLLT]#^  	
!#%	
 	
	 ?s   D!c                 4    U R                   U R                     $ r#   )rZ   rR   r&   s    r   bod_token_idCpmAntTokenizer.bod_token_id       ||DNN++r)   c                 4    U R                   U R                     $ r#   )rZ   rS   rf   s    r   eod_token_idCpmAntTokenizer.eod_token_id   ri   r)   c                      U R                   S   $ )Nr   rZ   rf   s    r   
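
    # Note on the remapping above: the vocab file stores the space and newline
    # tokens as "</_>" and "</n>", but at tokenization time they are looked up
    # as the literal characters " " and "\n". save_vocabulary() below performs
    # the inverse rewrite so a saved vocab round-trips.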

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens

    def _decode(self, token_ids, **kwargs):
        """Decode ids into a string."""
        token_ids = [i for i in token_ids if i >= 0]
        token_ids = [
            x for x in token_ids if x != self.pad_token_id and x != self.eos_token_id and x != self.bos_token_id
        ]
        return super()._decode(token_ids, **kwargs)

    def check(self, token):
        return token in self.encoder

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return "".join(tokens)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
            )
        else:
            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
        index = 0
        if " " in self.encoder:
            self.encoder["</_>"] = self.encoder[" "]
            del self.encoder[" "]
        if "\n" in self.encoder:
            self.encoder["</n>"] = self.encoder["\n"]
            del self.encoder["\n"]
        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in self.encoder.items():
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return (vocab_file,)
Ftoken_ids_0token_ids_1c                 j    Uc  U R                   /U-   $ U R                   /U-   U R                   /-   U-   $ )a  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A CPMAnt sequence has the following format:

- single sequence: `[BOS] Sequence`.

Args:
    token_ids_0 (`List[int]`): The first tokenized sequence that special tokens will be added.
    token_ids_1 (`List[int]`): The optional second tokenized sequence that special tokens will be added.

Returns:
    `List[int]`: The model input with special tokens.
)r   )r&   r   r   s      r    build_inputs_with_special_tokens0CpmAntTokenizer.build_inputs_with_special_tokens   sE      %%&44!!"[0D4E4E3FFTTr)   already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ Ub'  S/S/[        U5      -  -   S/-   S/[        U5      -  -   $ S/S/[        U5      -  -   $ )aT  
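
    # For example (ids are hypothetical): with bos_token_id == 6,
    # build_inputs_with_special_tokens([10, 11]) -> [6, 10, 11], and for a
    # pair, build_inputs_with_special_tokens([10], [12]) -> [6, 10, 6, 12].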

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`): List of IDs.
            token_ids_1 (`List[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))


__all__ = ["CpmAntTokenizer"]
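
# A minimal usage sketch (assumes a local vocab.txt and that jieba is
# installed; the file path is illustrative):
#
#   tokenizer = CpmAntTokenizer("vocab.txt")
#   ids = tokenizer.encode("今天天气真好")
#   text = tokenizer.decode(ids)
#
# Note that decode() drops negative ids as well as pad/bos/eos ids before
# reassembling the string, per _decode above.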