
r"""

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
    N)IteratorListTuple)
TokenizerI)MacIntyreContractions)align_tokensc            
       R   \ rS rSrSr\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4/r\R                  " S	5      S
4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4/r\R                  " S5      S4r	\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S5      S4\R                  " S 5      S!4/r
\R                  " S"5      S#4r\R                  " S$5      S%4\R                  " S&5      S%4\R                  " S'5      S(4\R                  " S)5      S(4/r\" 5       r\" \" \R                  \R"                  5      5      r\" \" \R                  \R$                  5      5      r S2S*\S+\S,\S-\\   4S. jjrS*\S-\\\\4      4S/ jrS0rg1)3TreebankWordTokenizer   a  
The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

This tokenizer performs the following steps:

- split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
- treat most punctuation characters as separate tokens
- split off commas and single quotes, when followed by whitespace
- separate periods that appear at the end of line

>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s)
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
>>> s = "They'll save and invest more."
>>> TreebankWordTokenizer().tokenize(s)
['They', "'ll", 'save', 'and', 'invest', 'more', '.']
>>> s = "hi, my name can't hello,"
>>> TreebankWordTokenizer().tokenize(s)
['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
z^\"``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(-LRB-z\)-RRB-z\[-LSB-z\]-RSB-z\{-LCB-z\}-RCB--- -- ''z '' "z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) textconvert_parentheses
return_strreturnc                    USLa  [         R                  " S[        SS9  U R                   H  u  pEUR	                  XQ5      nM     U R
                   H  u  pEUR	                  XQ5      nM     U R                  u  pEUR	                  XQ5      nU(       a&  U R                   H  u  pEUR	                  XQ5      nM     U R                  u  pEUR	                  XQ5      nSU-   S-   nU R                   H  u  pEUR	                  XQ5      nM     U R                   H  nUR	                  SU5      nM     U R                   H  nUR	                  SU5      nM     UR                  5       $ )aU  Return a tokenized copy of `text`.

>>> from nltk.tokenize import TreebankWordTokenizer
>>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
>>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']
>>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
'of', 'them.', 'Thanks', '.']

:param text: A string with a sentence or sentences.
:type text: str
:param convert_parentheses: if True, convert parentheses to PTB symbols,
    e.g. `(` to `-LRB-`. Defaults to False.
:type convert_parentheses: bool, optional
:param return_str: If True, return tokens as space-separated string,
    defaults to False.
:type return_str: bool, optional
:return: List of tokens from `text`.
:rtype: List[str]
        """
        if return_str is not False:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
Returns the spans of the tokens in ``text``.
Uses the post-hoc ``nltk.tokenize.util.align_tokens`` to return the offset spans.

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
    ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
    ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
    ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
    >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
    True
    >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
    ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
    >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
    True

:param text: A string with a sentence or sentences.
:type text: str
:yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to the original double quotes, but
        # only if the original text contains double quote(s) or two
        # consecutive single quotes (because '' might be transformed to ``
        # when treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back with double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)


class TreebankWordDetokenizer(TokenizerI):
    r"""
The Treebank detokenizer uses the reverse regex operations corresponding to
the Treebank tokenizer's regexes.

Note:

- There are additional assumptions made when undoing the padding of ``[;@#$%&]``
  punctuation symbols that are not presupposed in the TreebankTokenizer
  (see the example after this list).
- There are additional regexes added in reversing the parentheses tokenization,
  such as ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
  padding added to the closing parentheses preceding ``[:;,.]``.
- It is not possible to restore the original whitespace exactly, because no
  explicit record of where ``'\n'``, ``'\t'`` or ``'\s'`` was removed is kept
  by the ``text.split()`` operation.
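
For instance (an illustrative example, not one of the original doctests),
the ``[;@#$%&]`` padding is undone asymmetrically: ``$`` is joined to the
token after it, while ``;`` is joined to the token before it:

>>> TreebankWordDetokenizer().detokenize(['pay', '$', '5', ';', 'then', 'leave'])
'pay $5; then leave'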

>>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
>>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
>>> d = TreebankWordDetokenizer()
>>> t = TreebankWordTokenizer()
>>> toks = t.tokenize(s)
>>> d.detokenize(toks)
'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
parameter:

>>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
>>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
>>> expected_tokens == t.tokenize(s, convert_parentheses=True)
True
>>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
>>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
True

During tokenization it's safe to add more spaces, but during detokenization
simply undoing the padding is not sufficient:

- During tokenization, a left and a right pad are added to ``[!?]``; when
  detokenizing, only a left shift of the ``[!?]`` is needed.
  Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

- During tokenization ``[:,]`` are left and right padded, but when detokenizing,
  only a left shift is necessary, and the right pad after a comma/colon is kept
  if the string that follows is a non-digit (see the example below).
  Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.
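
For example (an illustrative addition, not one of the original doctests), a
comma token attaches to the token on its left while keeping a space on its
right, and the comma inside ``3,36`` is never split in the first place:

>>> TreebankWordDetokenizer().detokenize(['No', ',', 'it', 'is', '3,36', 'euros'])
'No, it is 3,36 euros'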

>>> from nltk.tokenize.treebank import TreebankWordDetokenizer
>>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
>>> twd = TreebankWordDetokenizer()
>>> twd.detokenize(toks)
"hello, i can't feel my feet! Help!!"

>>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
>>> twd.detokenize(toks)
"hello, i can't feel; my feet! Help!! He said: Help, help?!"
z(?#X)z\sz+([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) z\1\2 z([^' ])\s('[sS]|'[mM]|'[dD]|') z(\S)\s(\'\')\1\2z(\'\')\s([.,:)\]>};%])r   r   r   r   r   (r   )r   [r   ]r   {r   }z([\[\(\{\<])\sz\g<1>z\s([\]\)\}\>])z([\]\)\}\>])\s([:;,.])z([^'])\s'\sz\1' z\s([?!])z([^\.])\s(\.)([\]\)}>"\']*)\s*$z\1\2\3z([#$])\sz\s([;%])z
\s\.\.\.\sz...z\s([:,])z\1z([ (\[{<])\s``z\1``z(``)\sr   r=   r   r   c                    SR                  U5      nSU-   S-   nU R                   H  nUR                  SU5      nM     U R                   H  nUR                  SU5      nM     U R                   H  u  pEUR                  XS5      nM     UR                  5       nU R                  u  pEUR                  XS5      nU(       a&  U R                   H  u  pEUR                  XS5      nM     U R                   H  u  pEUR                  XS5      nM     U R                   H  u  pEUR                  XS5      nM     U R                   H  u  pEUR                  XS5      nM     UR                  5       $ )a]  
Treebank detokenizer, created by undoing the regexes from
the TreebankWordTokenizer.tokenize.

:param tokens: A list of strings, i.e. tokenized text.
:type tokens: List[str]
:param convert_parentheses: if True, replace PTB symbols with parentheses,
    e.g. `-LRB-` to `(`. Defaults to False.
:type convert_parentheses: bool, optional
:return: str
r   rQ   )joinr+   r$   r*   r)   stripr(   r'   r&   r%   r#   )r-   r=   r   r   r.   r/   s         r0   r1    TreebankWordDetokenizer.tokenize[  sN    xx TzC ((F::gt,D )((F::gt,D ) %)$6$6 F::l1D %7 zz|  $11zz,-(,(@(@$zz,5 )A %)$8$8 F::l1D %9 %)$4$4 F::l1D %5 %)$8$8 F::l1D %9 zz|r3   c                 $    U R                  X5      $ )z&Duck-typing the abstract *tokenize()*.)r1   )r-   r=   r   s      r0   
detokenize"TreebankWordDetokenizer.detokenize  s    }}V99r3   r@   N)F)rA   rB   rC   rD   rE   r   rG   r*   r5   rF   replacer+   r)   r(   r'   r&   r%   r#   r   rJ   rK   r1   r]   rM   ).0patternr5   s   000r0   rO   rO      s   :x *+M %222G 	

7??7E232M %222G 	

7??7E232M 
B	CXN	6	7B	O	$g.JJ01	
 
E	C 	M ZZ(%0M 
G	c"	G	c"	G	c"	G	c"	G	c"	G	c" 
%	&1	%	&1	-	.8O 
N	#W-	K	 (+	6	7C
 
K	 (+	K	 (+	M	"F+ JJ{#	
K, 
%	&0	I	&	E	D!O3tCy 3t 3PS 3j:c : :RU : :us   0J#%0J*rO   )rE   r5   r    typingr   r   r   nltk.tokenize.apir   nltk.tokenize.destructiver   nltk.tokenize.utilr   r
   rO   r@   r3   r0   <module>rf      s>    
  ( ( ( ; +x.J x.vz:j z:r3   