
    /h$                     j    S SK r S SKrS SKJrJrJr  S SKJr  S SKJ	r	   " S S5      r
 " S S\5      rg)	    N)IteratorListTuple)
TokenizerI)align_tokensc                   0    \ rS rSrSr/ SQrSS/rSS/rSrg	)
MacIntyreContractions   zA
List of contractions adapted from Robert MacIntyre's tokenizer.
)z(?i)\b(can)(?#X)(not)\bz(?i)\b(d)(?#X)('ye)\bz(?i)\b(gim)(?#X)(me)\bz(?i)\b(gon)(?#X)(na)\bz(?i)\b(got)(?#X)(ta)\bz(?i)\b(lem)(?#X)(me)\bz(?i)\b(more)(?#X)('n)\bz(?i)\b(wan)(?#X)(na)(?=\s)z(?i) ('t)(?#X)(is)\bz(?i) ('t)(?#X)(was)\bz(?i)\b(whad)(dd)(ya)\bz(?i)\b(wha)(t)(cha)\b N)	__name__


class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the tokenizer that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" in that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKWordTokenizer.tokenize`, but there is no guarantee of recovering
    the original string.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"\s+"), " "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$', re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        # Treat runs of two or more dots (ellipses) as a single token.
        (re.compile(r"\.{2,}", re.U), r" \g<0> "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        # Handles the final period.
        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r"\1 \2\3 "),
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (re.compile(r"[*]", re.U), r" \g<0> "),
    ]

    # Pads parentheses and brackets with spaces.
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally: convert parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # Contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
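
    # A small illustrative sketch (an addition, not part of the original
    # module): PARENS_BRACKETS pads every bracket with spaces, after which
    # CONVERT_PARENTHESES can optionally rewrite the brackets as their Penn
    # Treebank symbols:
    #
    #     >>> pattern, substitution = NLTKWordTokenizer.PARENS_BRACKETS
    #     >>> pattern.sub(substitution, "a (b)")
    #     'a  ( b ) '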
    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses with PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as a space-separated string;
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses and brackets.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()
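
    # Illustrative sketch (an addition, not part of the original module): the
    # quote rules above rewrite ASCII double quotes into the Penn Treebank
    # convention of `` for opening and '' for closing quotes:
    #
    #     >>> NLTKWordTokenizer().tokenize('She said "hello".')
    #     ['She', 'said', '``', 'hello', "''", '.']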
    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to the original double quotes, but only
        # if the original text contains double quote(s) or double single-quotes
        # (because '' might be transformed to `` if it is treated as a
        # starting quote).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Put the original quote characters back in place of the
            # converted quote tokens.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)