
"""Tokenization classes for BERTweet"""

import html
import os
import re
from shutil import copyfile
from typing import List, Optional, Tuple

import regex

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.txt",
    "merges_file": "bpe.codes",
}


def get_pairs(word):
    """
    Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char

    pairs = set(pairs)
    return pairs
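

# Illustrative only (not part of the public API): for a token already split
# into BPE symbols, `get_pairs` yields every adjacent symbol pair, e.g.
#
#     >>> sorted(get_pairs(("l", "o", "w", "er</w>")))
#     [('l', 'o'), ('o', 'w'), ('w', 'er</w>')]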
       V  ^  \ rS rSrSr\r        SU 4S jjr SS\\	   S\
\\	      S\\	   4S jjr SS\\	   S\
\\	      S\S\\	   4U 4S	 jjjr SS\\	   S\
\\	      S\\	   4S
 jjr\S 5       rS rS rS rS rS rS rS rS rSS\S\
\   S\\   4S jjrS rSrU =r$ )BertweetTokenizer6   a8	  
Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.

This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    merges_file (`str`):
        Path to the merges file.
    normalization (`bool`, *optional*, defaults to `False`):
        Whether or not to apply a normalization preprocess.
    bos_token (`str`, *optional*, defaults to `"<s>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the beginning of
        sequence. The token used is the `cls_token`.

        </Tip>

    eos_token (`str`, *optional*, defaults to `"</s>"`):
        The end of sequence token.

        <Tip>

        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
        The token used is the `sep_token`.

        </Tip>

    sep_token (`str`, *optional*, defaults to `"</s>"`):
        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
        sequence classification or for a text and a question for question answering. It is also used as the last
        token of a sequence built with special tokens.
    cls_token (`str`, *optional*, defaults to `"<s>"`):
        The classifier token which is used when doing sequence classification (classification of the whole sequence
        instead of per-token classification). It is the first token of the sequence when built with special tokens.
    unk_token (`str`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    pad_token (`str`, *optional*, defaults to `"<pad>"`):
        The token used for padding, for example when batching sequences of different lengths.
    mask_token (`str`, *optional*, defaults to `"<mask>"`):
        The token used for masking values. This is the token used when training this model with masked language
        modeling. This is the token which the model will try to predict.
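
    # Usage sketch (illustrative; assumes the public `vinai/bertweet-base`
    # checkpoint, which ships the `vocab.txt`/`bpe.codes` files expected here):
    #
    #     >>> from transformers import BertweetTokenizer
    #     >>> tokenizer = BertweetTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
    #     >>> tokenizer("SC has first two presumptive cases of coronavirus")["input_ids"]
    #     # -> ids framed by <s> ... </s> (see build_inputs_with_special_tokens below)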
c                   >  SSK Jn  Xl        Xl        X l        0 U l        SU R                  [        U5      '   SU R                  [        U	5      '   SU R                  [        U5      '   SU R                  [        U5      '   U R                  U5        U R                  R                  5        VVs0 s H  u  pX_M	     snnU l        [        USS9 nUR                  5       R                  S	5      S S
 nS S S 5        W Vs/ s H  n[!        UR                  5       S S
 5      PM!     nn[#        [%        U['        [)        U5      5      5      5      U l        0 U l        X0l        [1        5       U l        SSS.U l        [6        TU ]p  " SUUUUUUU	U
S.UD6  g ! [         a     [        R                  S5        S U l         GNf = fs  snnf ! , (       d  f       N= fs  snf )Nr   )demojizezsemoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0r      r   utf-8encoding
'z...)u   ’u   …)normalization	bos_token	eos_token	sep_token	cls_token	unk_token	pad_token
mask_token )emojir   	demojizerImportErrorloggerwarningr
   r   encoderstradd_from_fileitemsdecoderopenreadsplittupledictziprangelen	bpe_rankscacher#   TweetTokenizertweetPreprocessorspecial_punctssuper__init__)selfr
   r   r#   r$   r%   r&   r'   r(   r)   r*   kwargsr   kvmerges_handlemergesmerge	__class__s                     r   rD   BertweetTokenizer.__init__k   s   		"&%N %&'(S^$'(S^$'(S^$'(S^$:&)-););)=>)=)=>+0M"'')//5cr:F 19?@%cr*+@c&%F*<=>
*!/!1&)%8 
	
'!
	
 
	
=  	"NN( "DN	"$ ?00@s)   F 0G#G &G&F>=F>
Gtoken_ids_0token_ids_1returnc                     Uc  U R                   /U-   U R                  /-   $ U R                   /nU R                  /nX1-   U-   U-   U-   U-   $ )a7  

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A BERTweet sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
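
    # Worked example (hypothetical ids 5, 6, 7): a single sequence [5, 6]
    # becomes [cls, 5, 6, sep]; a pair ([5, 6], [7]) becomes
    # [cls, 5, 6, sep, sep, 7, sep].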

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does
        not make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.
        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
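
    # Worked example for the two methods above, with token_ids_0=[5, 6] and
    # token_ids_1=[7]: get_special_tokens_mask gives [1, 0, 0, 1, 1, 0, 1],
    # and create_token_type_ids_from_sequences gives [0, 0, 0, 0, 0, 0, 0],
    # since BERTweet does not use segment ids.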

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = "@@ ".join(word)
        word = word[:-4]
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        if self.normalization:  # Perform Tweet normalization before performing BPE
            text = self.normalizeTweet(text)

        split_tokens = []
        words = re.findall(r"\S+\n?", text)
        for token in words:
            split_tokens.extend(list(self.bpe(token).split(" ")))
        return split_tokens
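
    # Sketch of the `bpe` merge loop above, under a hypothetical merge table:
    # if ("l", "o") is the lowest-ranked pair in `bpe_ranks`, the word
    # ("l", "o", "w</w>") becomes ("lo", "w</w>"); once no ranked pair is
    # left, the symbols are joined with the "@@ " continuation marker and the
    # trailing "</w>" is stripped, yielding "lo@@ w".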

    def normalizeTweet(self, tweet):
        """
        Normalize a raw Tweet.
        """
        for punct in self.special_puncts:
            tweet = tweet.replace(punct, self.special_puncts[punct])

        tokens = self.tweetPreprocessor.tokenize(tweet)
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
        )

        return " ".join(normTweet.split())

    def normalizeToken(self, token):
        """
        Normalize tokens in a Tweet.
        """
        lowercased_token = token.lower()
        if token.startswith("@"):
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            if token in self.special_puncts:
                return self.special_puncts[token]
            if self.demojizer is not None:
                return self.demojizer(token)
            else:
                return token
        else:
            return token

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) into a single string."""
        out_string = " ".join(tokens).replace("@@ ", "").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        out_merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # Note: this fallback assumes an `sp_model` attribute, which this
            # tokenizer does not define; it is only reachable if the original
            # vocab file has gone missing.
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
            copyfile(self.merges_file, out_merge_file)

        return out_vocab_file, out_merge_file
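
    # Illustrative effect of `normalizeTweet`/`normalizeToken` above (assuming
    # the `emoji` package is installed; the URL is a made-up example):
    #
    #     "@remy cannot wait for http://t.co/xyz"
    #       -> "@USER can not wait for HTTPURL"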

    def add_from_file(self, f):
        """
        Loads a pre-existing dictionary from a text file and adds its symbols to this instance.
        """
        if isinstance(f, str):
            try:
                with open(f, "r", encoding="utf-8") as fd:
                    self.add_from_file(fd)
            except FileNotFoundError as fnfe:
                raise fnfe
            except UnicodeError:
                raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset")
            return

        lines = f.readlines()
        for lineTmp in lines:
            line = lineTmp.strip()
            idx = line.rfind(" ")
            if idx == -1:
                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
            word = line[:idx]
            self.encoder[word] = len(self.encoder)
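

# Note: the vocab file consumed by `BertweetTokenizer.add_from_file` above is
# expected to hold one "<token> <count>" pair per line (hypothetical example):
#
#     wanna</w> 12345
#     gonna</w> 12008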


# The tokenizer below is adapted from NLTK's Twitter-aware tokenizer
# (`nltk.tokenize.casual`).

EMOTICONS = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      <3                         # heart
    )"""

# URL pattern due to John Gruber, modified by Tom Winzig. See
# https://gist.github.com/winzig/8894715
URLS = r"""			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
    (?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
"""

# The components of the tokenizer:
REGEXPS = (
    URLS,
    # Phone numbers:
    r"""
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )""",
    # ASCII Emoticons
    EMOTICONS,
    # HTML tags:
    r"""<[^>\s]+>""",
    # ASCII Arrows
    r"""[\-]+>|<[\-]+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Twitter hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # email addresses
    r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""",
    # Remaining word types:
    r"""
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    z(%s)|z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);c                 \    Uc  Sn[        U [        5      (       a  U R                  X5      $ U $ )Nr   )r   bytesdecode)r   r   errorss      r   _str_to_unicoder   \  s.    ${{8,,Kr   c                 R   ^^ UU4S jn[         R                  U[        X5      5      $ )u  


def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"):
    r"""
    Remove entities from text by converting them to their corresponding unicode character.

    Args:
        text:
            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
        keep (list):
            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
        remove_illegal (bool):
            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
            kept "as is".

    Returns: A unicode string with the entities removed.

    See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    Examples:

    ```python
    >>> from nltk.tokenize.casual import _replace_html_entities

    >>> _replace_html_entities(b"Price: &pound;100")
    'Price: \xa3100'

    >>> print(_replace_html_entities(b"Price: &pound;100"))
    Price: £100
    ```"""

    def _convert_entity(match):
        entity_body = match.group(3)
        if match.group(1):
            try:
                if match.group(2):
                    number = int(entity_body, 16)
                else:
                    number = int(entity_body, 10)
                # Numeric character references in the 80-9F range are typically
                # interpreted by browsers as representing the characters mapped
                # by bytes 80-9F in the Windows-1252 encoding. For more info
                # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets
                if 0x80 <= number <= 0x9F:
                    return bytes((number,)).decode("cp1252")
            except ValueError:
                number = None
        else:
            if entity_body in keep:
                return match.group(0)
            else:
                number = html.entities.name2codepoint.get(entity_body)

        if number is not None:
            try:
                return chr(number)
            except (ValueError, OverflowError):
                pass

        return "" if remove_illegal else match.group(0)

    return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding))


class TweetTokenizer:
    r"""
    Examples:

    ```python
    >>> # Tokenizer for tweets.
    >>> from nltk.tokenize import TweetTokenizer

    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0)
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']

    >>> # Examples using the *strip_handles* and *reduce_len* parameters:
    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
    ```"""

    def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
        self.preserve_case = preserve_case
        self.reduce_len = reduce_len
        self.strip_handles = strip_handles

    def tokenize(self, text):
        """
        Args:
            text: str

        Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if
        `preserve_case=False`
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = [x if EMOTICON_RE.search(x) else x.lower() for x in words]
        return words


def reduce_lengthening(text):
    """
    Replace repeated character sequences of length 3 or greater with sequences of length 3.
    """
    pattern = regex.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1\1", text)


def remove_handles(text):
    """
    Remove Twitter username handles from text.
    """
    pattern = regex.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)"
    )
    # Substitute handles with ' ' to ensure that text on either side of removed handles are tokenized correctly
    return pattern.sub(" ", text)
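

# For example, remove_handles("@remy: This is waaaaayyyy too much") returns
# " : This is waaaaayyyy too much".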


def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False):
    """
    Convenience function for wrapping the tokenizer.
    """
    return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(
        text
    )


__all__ = ["BertweetTokenizer"]