
    fTh                        S r SSKrSSKrSSKJrJr  SSKrSSKJ	r	  SSK
Jr  \R                  " \5      rSSS	.r0 S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_S S!_S"S#_S$S%_S&S'_S(S)_S*S+_0 S,S-_S.S/_S0S1_S2S3_S4S5_S6S7_S8S9_S:S;_S<S=_S>S?_S@SA_SBSC_SDSE_SFSG_SHSI_SJSK_SLSM_E0 SNSO_SPSQ_SRSS_STSU_SVSW_SXSY_SZS[_S\S]_S^S__S`Sa_SbSc_SdSe_SfSg_ShSi_SjSk_SlSm_SnSo_ESpSqSrSsSt.ErSu r " Sv Sw\	5      rSw/rg)xz)Tokenization classes for Salesforce CTRL.    N)OptionalTuple   )PreTrainedTokenizer)loggingz
vocab.jsonz
merges.txt)
vocab_filemerges_file	Pregnancyi Christianityi  Explaini Fitnessi  Savingi  Aski#j  Assiv Jokei~ 	Questionsi6  Thoughtsi  Retailiv  Feminismi Writingi.  Atheismi Netflixi  	Computingiך  Opinioniͨ  Alonei  Funnyi%  Gamingi  Humani  Indiai3  JokeriR- Dietin  LegaliS.  NormaniK  Tipi Weightiw  Moviesi  Runningi[  Sciencei*  Horrori  
Confessioni  Financei/  Politicsi?  Scaryi Supportin1  Technologiesi  Teenageip Eventi  Learnedi Notioni 	Wikipediaiϒ  Booksi	  Extracti) Confessionsi- 
Conspiracyi( Linksi  	NarcissusiK Relationshipi  Relationshipsi iǢ  i  ih  i )ReviewsNewsTranslationmultilingualc                 z    [        5       nU S   nU SS  H  nUR                  X#45        UnM     [        U5      nU$ )zy
Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).
r      N)setadd)wordpairs	prev_charchars       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/ctrl/tokenization_ctrl.py	get_pairsrJ   ^   sH     EEQIQR		9#$	  JEL    c                      ^  \ rS rSrSr\r\rSU 4S jjr	\
S 5       rS rS rS rS rS	 rS
 rSS\S\\   S\\   4S jjrSrU =r$ )CTRLTokenizern   a0  
Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
c           
      j  > [        USS9 n[        R                  " U5      U l        S S S 5        U R                  R	                  5        VVs0 s H  u  pgXv_M	     snnU l        [        USS9 nUR                  5       R                  S5      SS n	S S S 5        W	 V
s/ s H  n
[        U
R                  5       5      PM     n	n
[        [        U	[        [        U	5      5      5      5      U l        0 U l        [        TU ]@  " SSU0UD6  g ! , (       d  f       N= fs  snnf ! , (       d  f       N= fs  sn
f )Nutf-8encoding
rB   	unk_token )openjsonloadencoderitemsdecoderreadsplittupledictziprangelen	bpe_rankscachesuper__init__)selfr   r	   rU   kwargsvocab_handlekvmerges_handlemergesmerge	__class__s              rI   rg   CTRLTokenizer.__init__   s    *w/<99\2DL 0)-););)=>)=)=>+0M"'')//5a;F 14:;F5%&F;c&%F*<=>
7977 0/>00;s#   DD.#D#D0
D
D-c                 ,    [        U R                  5      $ N)rc   rZ   rh   s    rI   
vocab_sizeCTRLTokenizer.vocab_size   s    4<<  rK   c                 B    [        U R                  40 U R                  D6$ rs   )r`   rZ   added_tokens_encoderrt   s    rI   	get_vocabCTRLTokenizer.get_vocab   s    DLL>D$=$=>>rK   c                 6  ^  UT R                   ;   a  T R                   U   $ [        U5      n[        [        US S 5      US   S-   /-   5      n[        U5      nU(       d  U$  [	        UU 4S jS9nUT R
                  ;  a  OUu  pV/ nSnU[        U5      :  a   UR                  XX5      n	UR                  X(U	 5        U	nX(   U:X  a6  U[        U5      S-
  :  a$  X(S-      U:X  a  UR                  XV-   5        US-  nOUR                  X(   5        US-  nU[        U5      :  a  M  [        U5      nUn[        U5      S:X  a  O[        U5      nM  SR                  U5      nUS S	 nUT R                   U'   U$ ! [         a    UR                  X(S  5         Mq  f = f)
NrT   z</w>c                 N   > TR                   R                  U [        S5      5      $ )Ninf)rd   getfloat)pairrh   s    rI   <lambda>#CTRLTokenizer.bpe.<locals>.<lambda>   s    1C1CD%PU,1WrK   keyr   rB      @@ )re   r_   listrJ   minrd   rc   indexextend
ValueErrorappendjoin)
rh   tokenrE   rF   bigramfirstsecondnew_wordijs
   `         rI   bpeCTRLTokenizer.bpe   s   DJJ::e$$U|T$s)_R6(9'::;$L$WXFT^^+"MEHAc$i-

5,A
 OOD1I.A7e#CIM(9dq5kV>SOOEN3FAOODG,FA c$i-  XHD4yA~!$9 : zz$CRy 

5- " OODH-s   E7 7FFc                     / n[         R                  " SU5      nU H;  nUR                  [        U R	                  U5      R                  S5      5      5        M=     U$ )zTokenize a string.z\S+\n? )refindallr   r   r   r^   )rh   textsplit_tokenswordsr   s        rI   	_tokenizeCTRLTokenizer._tokenize   sM    

9d+ETXXe_%:%:3%? @A rK   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ )z0Converts a token (str) in an id using the vocab.)rZ   r~   rU   )rh   r   s     rI   _convert_token_to_id"CTRLTokenizer._convert_token_to_id   s*    ||||'7'7'GHHrK   c                 L    U R                   R                  XR                  5      $ )z=Converts an index (integer) in a token (str) using the vocab.)r\   r~   rU   )rh   r   s     rI   _convert_id_to_token"CTRLTokenizer._convert_id_to_token   s    ||~~66rK   c                 d    SR                  U5      R                  SS5      R                  5       nU$ )z:Converts a sequence of tokens (string) in a single string.r   r    )r   replacestrip)rh   tokens
out_strings      rI   convert_tokens_to_string&CTRLTokenizer.convert_tokens_to_string   s,    XXf%--eR8>>@
rK   save_directoryfilename_prefixreturnc           
      d   [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  X(       a  US-   OS[        S   -   5      n[        USSS	9 nUR                  [        R                  " U R                  S
SSS9S-   5        S S S 5        Sn[        USSS	9 nUR                  S5        [        U R                  R                  5       S S9 HM  u  pXi:w  a  [        R                  SU S35        U	nUR                  SR                  U5      S-   5        US-  nMO     S S S 5        X44$ ! , (       d  f       N= f! , (       d  f       X44$ = f)NzVocabulary path (z) should be a directory-r   r   r	   wrP   rQ   r   TF)indent	sort_keysensure_asciirS   r   z#version: 0.2
c                     U S   $ )NrB   rV   )kvs    rI   r   /CTRLTokenizer.save_vocabulary.<locals>.<lambda>   s    Y[\]Y^rK   r   zSaving vocabulary to zZ: BPE merge indices are not consecutive. Please check that the tokenizer is not corrupted!r   rB   )ospathisdirloggererrorr   VOCAB_FILES_NAMESrW   writerX   dumpsrZ   sortedrd   r[   warning)
rh   r   r   r   
merge_filefr   writer
bpe_tokenstoken_indexs
             rI   save_vocabularyCTRLTokenizer.save_vocabulary   s   ww}}^,,LL,^,<<STUWW\\o_s22QbcoQpp

 WW\\o_s22QbcpQqq

 *cG4GGDJJt||ATYZ]aab 5 *cG4LL*++1$..2F2F2HN^+_'
'NN/
| <M M (ESXXj1D89
 ,` 5 %%! 54 54 %%s   44F=BF
F
F/)rd   re   r\   rZ   )z<unk>rs   )__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesCONTROL_CODEScontrol_codesrg   propertyru   ry   r   r   r   r   r   strr   r   r   __static_attributes____classcell__)rp   s   @rI   rM   rM   n   sx      *!M	8 ! !?*XI7
&c &HSM &]bcf]g & &rK   rM   )r   rX   r   typingr   r   regexr   tokenization_utilsr   utilsr   
get_loggerr   r   r   r   rJ   rM   __all__rV   rK   rI   <module>r      s   0  	 "  5  
		H	%  88D8 v8 u	8
 e8 
58 
58 F8 8 8 e8 8 u8 v8 u8  !8" u#8$ U%8& U'8( e)8* T+8, T-8. U/80 E182 U384 d586 
5788 e98: e;8< u=8> t?8@ eA8B %C8D uE8F G8H VI8J uK8L EM8N uO8P UQ8R uS8T fU8V W8X TY8Z u[8\ 6]8^ %_8` Ua8b c8d Ee8f Vg8h o8v D&' D&Z 
rK   