
"""Tokenization classes for OpenAI GPT."""

import json
import os
import re
import unicodedata
from typing import Optional, Tuple

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
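
    Example (illustrative addition, showing the behaviour of the default settings):

    ```python
    >>> BasicTokenizer().tokenize("Héllo, WORLD!")
    ['hello', ',', 'world', '!']
    ```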
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set by concatenating the two sets.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()


class OpenAIGPTTokenizer(PreTrainedTokenizer):
    """
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
      `BasicTokenizer` if not.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.
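
    Example (illustrative addition; assumes the publicly hosted `openai-gpt` checkpoint, from which
    `from_pretrained` fetches the `vocab.json` and `merges.txt` files):

    ```python
    >>> from transformers import OpenAIGPTTokenizer

    >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    >>> tokens = tokenizer.tokenize("Hello world!")  # lowercased BPE tokens, word-final pieces end in "</w>"
    ```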

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        try:
            import ftfy
            from spacy.lang.en import English

            _nlp = English()
            self.nlp = _nlp.tokenizer
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(list(self.bpe(token).split(" ")))
        else:
            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id in a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file


__all__ = ["OpenAIGPTTokenizer"]
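

# Illustrative sketch (not part of the library): a tiny standalone demo of the pure helper
# functions above, runnable as `python -m transformers.models.openai.tokenization_openai`.
# It deliberately avoids the vocab.json/merges.txt files that a full OpenAIGPTTokenizer
# instance would require.
if __name__ == "__main__":
    # get_pairs() enumerates the adjacent symbol pairs that BPE considers for merging.
    print(get_pairs(("h", "e", "r</w>")))  # {('h', 'e'), ('e', 'r</w>')} (set order may vary)

    # text_standardize() normalizes dashes/ellipses and collapses whitespace before BPE.
    print(text_standardize("hello…   world — again"))  # "hello... world - again"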