
"""Tokenization classes for CLIP."""

import json
import os
import unicodedata
from functools import lru_cache
from typing import List, Optional, Tuple

import regex as re

from ...tokenization_utils import AddedToken, PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


@lru_cache()
def bytes_to_unicode():
    """
Returns a mapping from utf-8 bytes to unicode strings. We specifically avoid mapping to whitespace/control
characters that the bpe code barfs on.

The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
tables between utf-8 bytes and unicode strings.
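
Example (an illustrative sketch of the mapping this function returns):

```python
>>> mapping = bytes_to_unicode()
>>> mapping[ord("a")]  # printable ASCII bytes map to themselves
'a'
>>> mapping[ord(" ")]  # whitespace/control bytes are shifted onto unused unicode code points
'Ġ'
```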
!~      ¡   ¬   ®   ÿNr      )listrangeordappendchrdictzip)bscsnbs       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/clip/tokenization_clip.pybytes_to_unicoder%   %   s     	U3s8SX\*+d5TCIPQM3R.SSVZ[`adeiajloptluxyly[zV{{  
AB	A4[;IIaLIIdQhFA	 
 	"Q#a&"B	B 
s   C:c                 d    [        5       nU S   nU SS  H  nUR                  X#45        UnM     U$ )zy
Return set of symbol pairs in a word.

Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def whitespace_clean(text):
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

Args:
    do_lower_case (`bool`, *optional*, defaults to `True`):
        Whether or not to lowercase the input when tokenizing.
    never_split (`Iterable`, *optional*):
        Collection of tokens which will never be split during tokenization. Only has an effect when
        `do_basic_tokenize=True`
    tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
        Whether or not to tokenize Chinese characters.

        This should likely be deactivated for Japanese (see this
        [issue](https://github.com/huggingface/transformers/issues/328)).
    strip_accents (`bool`, *optional*):
        Whether or not to strip all accents. If this option is not specified, then it will be determined by the
        value for `lowercase` (as in the original BERT).
    do_split_on_punc (`bool`, *optional*, defaults to `True`):
        In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
        the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

Args:
    never_split (`List[str]`, *optional*):
        Kept for backward compatibility purposes. Now implemented directly at the base class level (see
        [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
        """
        # union() returns a new set combining the instance-level and call-level never_split collections.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # Prevents treating the same character with different unicode codepoints as different characters.
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode blocks.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


class CLIPTokenizer(PreTrainedTokenizer):
    """
Construct a CLIP tokenizer. Based on byte-level Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
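
Example (a minimal usage sketch; `openai/clip-vit-base-patch32` is assumed to be a reachable checkpoint providing the
standard CLIP vocabulary and merges files):

```python
>>> from transformers import CLIPTokenizer

>>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer.tokenize("a photo of a cat")
['a</w>', 'photo</w>', 'of</w>', 'a</w>', 'cat</w>']
```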

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    errors (`str`, *optional*, defaults to `"replace"`):
        Paradigm to follow when decoding bytes to UTF-8. See
        [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
    unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
        The beginning of sequence token.
    eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The end of sequence token.
    pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
        The token used for padding, for example when batching sequences of different lengths.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|startoftext|>",
        eos_token="<|endoftext|>",
        pad_token="<|endoftext|>",  # the original vocab has no dedicated pad token; eos is reused for padding
        **kwargs,
    ):
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token

        try:
            import ftfy

            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.info("ftfy or spacy is not installed using custom BasicTokenizer instead of ftfy.")
            self.nlp = BasicTokenizer(strip_accents=False, do_split_on_punc=False)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            bpe_merges = merges_handle.read().strip().split("\n")[1 : 49152 - 256 - 2 + 1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {"<|startoftext|>": "<|startoftext|>", "<|endoftext|>": "<|endoftext|>"}

        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE,
        )

        super().__init__(
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
adding special tokens. A CLIP sequence has the following format:

- single sequence: `<|startoftext|> X <|endoftext|>`

Pairs of sequences are not the expected use case, but they will be handled without a separator.
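
A short sketch of the resulting layout, with `tokenizer` a loaded `CLIPTokenizer` (`320` and `1125` are placeholder
content IDs; `49406`/`49407` are the `<|startoftext|>`/`<|endoftext|>` IDs of the standard CLIP vocabulary):

```python
>>> tokenizer.build_inputs_with_special_tokens([320, 1125])
[49406, 320, 1125, 49407]
```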

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return bos_token + token_ids_0 + eos_token
        return bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.
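
A short sketch (`320` and `1125` are placeholder content IDs; the surrounding `1`s mark the added special tokens):

```python
>>> tokenizer.get_special_tokens_mask([320, 1125])
[1, 0, 0, 1]
```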

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1] + [1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
Create a mask from the two sequences passed. CLIP does not make use of token type ids, therefore a list of
zeros is returned.
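
A short sketch (placeholder content IDs); the returned list simply matches the length of the sequence once the special
tokens are added:

```python
>>> tokenizer.create_token_type_ids_from_sequences([320, 1125])
[0, 0, 0, 0]
```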

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
        `List[int]`: List of zeros.
        """
        bos_token = [self.bos_token_id]
        eos_token = [self.eos_token_id]

        if token_ids_1 is None:
            return len(bos_token + token_ids_0 + eos_token) * [0]
        return len(bos_token + token_ids_0 + eos_token + eos_token + token_ids_1 + eos_token) * [0]

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        bpe_tokens = []
        if self.fix_text is None:
            text = " ".join(self.nlp.tokenize(text))
        else:
            text = whitespace_clean(self.fix_text(text)).lower()

        for token in re.findall(self.pat, text):
            # Map all bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case).
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
        return bpe_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.decoder.get(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        byte_array = bytearray([self.byte_decoder[c] for c in text])
        text = byte_array.decode("utf-8", errors=self.errors).replace("</w>", " ").strip()
        return text

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!".format(merge_file)
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file


__all__ = ["CLIPTokenizer"]