
    /h=              
          S r SSKrSSKJr  SSKrSSKJr  SrSrSr	Sr
\\S	S
SSSS\	S4
r\S   \
/\SS Q7r\R                  " S5      r\R                  " \\R                  \R                   -  \R"                  -  5      r\R                  " S5      r\R                  " S5      rSS jrSS jr " S S\5      rS rS r    SS jrg)a  
Twitter-aware tokenizer, designed to be flexible and easy to adapt to new
domains and tasks. The basic logic is this:

1. The tuple REGEXPS defines a list of regular expression
   strings.

2. The REGEXPS strings are put, in order, into a compiled
   regular expression object called WORD_RE, under the TweetTokenizer
   class.

3. The tokenization is done by WORD_RE.findall(s), where s is the
   user-supplied string, inside the tokenize() method of the class
   TweetTokenizer.

4. When instantiating Tokenizer objects, there are several options:
    * preserve_case. By default, it is set to True. If it is set to
      False, then the tokenizer will downcase everything except for
      emoticons.
    * reduce_len. By default, it is set to False. It specifies whether
      to replace repeated character sequences of length 3 or greater
      with sequences of length 3.
    * strip_handles. By default, it is set to False. It specifies
      whether to remove Twitter handles of text used in the
      `tokenize` method.
    * match_phone_numbers. By default, it is set to True. It indicates
      whether the `tokenize` method should look for phone numbers.
    N)List)
TokenizerIac  
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )u  			# Capture 1: entire matched URL
  (?:
  https?:				# URL protocol and colon
    (?:
      /{1,3}				# 1-3 slashes
      |					#   or
      [a-z0-9%]				# Single letter or digit or '%'
                                       # (Trying not to match e.g. "URI::Escape")
    )
    |					#   or
                                       # looks like domain name followed by a slash:
    [a-z0-9.\-]+[.]
    (?:[a-z]{2,13})
    /
  )
  (?:					# One or more:
    [^\s()<>{}\[\]]+			# Run of non-space, non-()<>{}[]
    |					#   or
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
  )+
  (?:					# End with:
    \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...)
    |
    \([^\s]+?\)				# balanced parens, non-recursive: (...)
    |					#   or
    [^\s`!()\[\]{};:'".,<>?«»“”‘’]	# not a space or one of these punct chars
  )
  |					# OR, the following to match naked domains:
  (?:
  	(?<!@)			        # not preceded by a @, avoid matching foo@_gmail.com_
    [a-z0-9]+
    (?:[.\-][a-z0-9]+)*
    [.]
    (?:[a-z]{2,13})
    \b
    /?
    (?!@)			        # not succeeded by a @,
                            # avoid matching "foo.na" in "foo.na@example.com"
  )
u  
  (?:
    [\U0001F1E6-\U0001F1FF]{2}  # all enclosed letter pairs
    |
    # English flag
    \U0001F3F4\U000E0067\U000E0062\U000E0065\U000E006e\U000E0067\U000E007F
    |
    # Scottish flag
    \U0001F3F4\U000E0067\U000E0062\U000E0073\U000E0063\U000E0074\U000E007F
    |
    # For Wales? Why Richard, it profit a man nothing to give his soul for the whole world … but for Wales!
    \U0001F3F4\U000E0067\U000E0062\U000E0077\U000E006C\U000E0073\U000E007F
  )
a	  
    (?:
      (?:            # (international)
        \+?[01]
        [ *\-.\)]*
      )?
      (?:            # (area code)
        [\(]?
        \d{3}
        [ *\-.\)]*
      )?
      \d{3}          # exchange
      [ *\-.\)]*
      \d{4}          # base
    )z	<[^>\s]+>z[\-]+>|<[\-]+z(?:@[\w_]+)z(?:\#+[\w_]+[\w\'_\-]*[\w_]+)z#[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]uR   .(?:
        [🏻-🏿]?(?:‍.[🏻-🏿]?)+
        |
        [🏻-🏿]
    )a  
    (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes.
    |
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
       z([^a-zA-Z0-9])\1{3,}z&(#?(x?))([^&;\s]+);zZ(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){15}(?!@)|([A-Za-z0-9_]){1,14}(?![A-Za-z0-9_]*@))c                 \    Uc  Sn[        U [        5      (       a  U R                  X5      $ U $ )Nutf-8)
isinstancebytesdecode)textencodingerrorss      L/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tokenize/casual.py_str_to_unicoder      s.    ${{8,,K    c                 R   ^^ UU4S jn[         R                  U[        X5      5      $ )us  
Remove entities from text by converting them to their
corresponding unicode character.

:param text: a unicode string or a byte string encoded in the given
`encoding` (which defaults to 'utf-8').

:param list keep:  list of entity names which should not be replaced.    This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``)
and named entities (such as ``&nbsp;`` or ``&gt;``).

:param bool remove_illegal: If `True`, entities that can't be converted are    removed. Otherwise, entities that can't be converted are kept "as
is".

:returns: A unicode string with the entities removed.

See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py

    >>> from nltk.tokenize.casual import _replace_html_entities
    >>> _replace_html_entities(b'Price: &pound;100')
    'Price: \xa3100'
    >>> print(_replace_html_entities(b'Price: &pound;100'))
    Price: £100
    >>>
c                 ,  > U R                  S5      nU R                  S5      (       a\   U R                  S5      (       a  [        US5      nO[        US5      nSUs=::  a  S::  a  O  O\[        U45      R                  S5      $ O@UT;   a  U R                  S	5      $ [
        R                  R                  R                  U5      nUb   [        U5      $ T(       a  S
$ U R                  S	5      $ ! [         a    S n N7f = f! [        [        4 a     N>f = f)N   r         
         cp1252r    )groupintr	   r
   
ValueErrorhtmlentitiesname2codepointgetchrOverflowError)matchentity_bodynumberkeepremove_illegals      r   _convert_entity/_replace_html_entities.<locals>._convert_entity  s    kk!n;;q>>;;q>> b1F b1F
 6)T) &+228<< *
 d"{{1~%]]1155kBF6{" $r7Q7   . s)   >C. )C. 	
D  .C=<C= DD)ENT_REsubr   )r   r'   r(   r   r)   s    ``  r   _replace_html_entitiesr-      s     888 ::ot'FGGr   c                   r    \ rS rSrSrSrSr    SS jrS\S\	\   4S jr
\SS j5       r\SS	 j5       rS
rg)TweetTokenizeri2  a  
Tokenizer for tweets.

    >>> from nltk.tokenize import TweetTokenizer
    >>> tknzr = TweetTokenizer()
    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
    >>> tknzr.tokenize(s0) # doctest: +NORMALIZE_WHITESPACE
    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->',
     '<--']

Examples using `strip_handles` and `reduce_len parameters`:

    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
    >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
    >>> tknzr.tokenize(s1)
    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
Nc                 4    Xl         X l        X0l        X@l        g)a  
Create a `TweetTokenizer` instance with settings for use in the `tokenize` method.

:param preserve_case: Flag indicating whether to preserve the casing (capitalisation)
    of text used in the `tokenize` method. Defaults to True.
:type preserve_case: bool
:param reduce_len: Flag indicating whether to replace repeated character sequences
    of length 3 or greater with sequences of length 3. Defaults to False.
:type reduce_len: bool
:param strip_handles: Flag indicating whether to remove Twitter handles of text used
    in the `tokenize` method. Defaults to False.
:type strip_handles: bool
:param match_phone_numbers: Flag indicating whether the `tokenize` method should look
    for phone numbers. Defaults to True.
:type match_phone_numbers: bool
Npreserve_case
reduce_lenstrip_handlesmatch_phone_numbers)selfr2   r3   r4   r5   s        r   __init__TweetTokenizer.__init__L  s    . +$*#6 r   r   returnc                    [        U5      nU R                  (       a  [        U5      nU R                  (       a  [	        U5      n[
        R                  SU5      nU R                  (       a  U R                  R                  U5      nOU R                  R                  U5      nU R                  (       d  [        [        S U5      5      nU$ )zTokenize the input text.

:param text: str
:rtype: list(str)
:return: a tokenized list of strings; joining this list returns        the original string if `preserve_case=False`.
\1\1\1c                 Z    [         R                  U 5      (       a  U $ U R                  5       $ )N)EMOTICON_REsearchlower)xs    r   <lambda>)TweetTokenizer.tokenize.<locals>.<lambda>  s"    K$6$6q$9$9qHqwwyHr   )r-   r4   remove_handlesr3   reduce_lengtheningHANG_REr,   r5   PHONE_WORD_REfindallWORD_REr2   listmap)r6   r   	safe_textwordss       r   tokenizeTweetTokenizer.tokenizeh  s     &d+!$'D??%d+DKK	40	##&&..y9ELL((3E!!H5QE r   c                 8   [        U 5      R                  (       dl  [        R                  " SSR	                  [
        5       S3[        R                  [        R                  -  [        R                  -  5      [        U 5      l        [        U 5      R                  $ )zCore TweetTokenizer regex(|))	type_WORD_REregexcompilejoinREGEXPSVERBOSEIUNICODEr6   s    r   rH   TweetTokenizer.WORD_RE  sg     Dz"""'--CHHW%&a('%--7#DJ Dz"""r   c                 8   [        U 5      R                  (       dl  [        R                  " SSR	                  [
        5       S3[        R                  [        R                  -  [        R                  -  5      [        U 5      l        [        U 5      R                  $ )z#Secondary core TweetTokenizer regexrP   rQ   rR   )	rS   _PHONE_WORD_RErU   rV   rW   REGEXPS_PHONErY   rZ   r[   r\   s    r   rF   TweetTokenizer.PHONE_WORD_RE  sg     Dz(((-CHH]+,A.'%--7)DJ% Dz(((r   )r5   r2   r3   r4   TFFT)r9   zregex.Pattern)__name__
__module____qualname____firstlineno____doc__rT   r_   r7   strr   rM   propertyrH   rF   __static_attributes__ r   r   r/   r/   2  se    ( HN  78S T#Y < # # ) )r   r/   c                 R    [         R                  " S5      nUR                  SU 5      $ )zY
Replace repeated character sequences of length 3 or greater with sequences
of length 3.
z	(.)\1{2,}r;   )rU   rV   r,   )r   patterns     r   rD   rD     s#    
 mmL)G;;y$''r   c                 .    [         R                  SU 5      $ )z,
Remove Twitter username handles from text.
 )
HANDLES_REr,   )r   s    r   rC   rC     s    
 >>#t$$r   c                 8    [        UUUUS9R                  U 5      $ )z2
Convenience function for wrapping the tokenizer.
r1   )r/   rM   )r   r2   r3   r4   r5   s        r   casual_tokenizerr     s(     ##/	
 htnr   )Nstrict)rk   Tr   rb   )rg   r   typingr   rU   nltk.tokenize.apir   	EMOTICONSURLSFLAGSPHONE_REGEXrX   r`   rV   rE   rY   rZ   r[   r=   r+   rp   r   r-   r/   rD   rC   rr   rk   r   r   <module>rz      s  @    ($		$)f	 	$ 	(.	 

/"J [7712;7 --/
0 mmIu}}uww'>'NO 
.	/ ]]H
8H|h)Z h)`(% r   