o
    rZŽh‹?  ã                   @   sp   d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
 ddlmZ G dd„ deƒZG d	d
„ d
eƒZdS )a	  

Penn Treebank Tokenizer

The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.
This implementation is a port of the tokenizer sed script written by Robert McIntyre
and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed.
é    N)ÚIteratorÚListÚTuple)Ú
TokenizerI)ÚMacIntyreContractions)Úalign_tokensc                
   @   s–  e Zd ZdZe d¡dfe d¡dfe d¡dfgZe d¡d	fe d
¡dfe d¡dfe d¡dfe d¡dfe d¡dfe d¡dfgZe d¡dfZe d¡dfe d¡dfe d¡dfe d¡dfe d¡dfe d¡d fgZ	e d!¡d"fZ
e d#¡d$fe d%¡d$fe d&¡d'fe d(¡d'fgZeƒ ZeeejejƒƒZeeejejƒƒZ	)d3d*ed+ed,ed-ee fd.d/„Zd*ed-eeeef  fd0d1„Zd2S )4ÚTreebankWordTokenizera	  
    The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank.

    This tokenizer performs the following steps:

    - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll``
    - treat most punctuation characters as separate tokens
    - split off commas and single quotes, when followed by whitespace
    - separate periods that appear at the end of line

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> TreebankWordTokenizer().tokenize(s)
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
    >>> s = "They'll save and invest more."
    >>> TreebankWordTokenizer().tokenize(s)
    ['They', "'ll", 'save', 'and', 'invest', 'more', '.']
    >>> s = "hi, my name can't hello,"
    >>> TreebankWordTokenizer().tokenize(s)
    ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ',']
    z^\"ú``z(``)z \1 z([ \(\[{<])(\"|\'{2})z\1 `` z([:,])([^\d])z \1 \2z([:,])$z\.\.\.z ... z[;@#$%&]z \g<0> z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[\]\[\(\)\{\}\<\>]z\(ú-LRB-z\)ú-RRB-z\[ú-LSB-z\]ú-RSB-z\{ú-LCB-z\}ú-RCB-ú--ú -- ú''z '' ú"z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) FÚtextÚconvert_parenthesesÚ
return_strÚreturnc                 C   sü   |durt jdtdd | jD ]
\}}| ||¡}q| jD ]
\}}| ||¡}q| j\}}| ||¡}|rC| jD ]
\}}| ||¡}q8| j\}}| ||¡}d| d }| j	D ]
\}}| ||¡}qW| j
D ]}| d|¡}qe| jD ]}| d|¡}qq| ¡ S )aý  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import TreebankWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> TreebankWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> TreebankWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        FzHParameter 'return_str' has been deprecated and should no longer be used.é   )ÚcategoryÚ
stacklevelú z \1 \2 )ÚwarningsÚwarnÚDeprecationWarningÚSTARTING_QUOTESÚsubÚPUNCTUATIONÚPARENS_BRACKETSÚCONVERT_PARENTHESESÚDOUBLE_DASHESÚENDING_QUOTESÚCONTRACTIONS2ÚCONTRACTIONS3Úsplit)Úselfr   r   r   ÚregexpÚsubstitution© r,   úE/var/www/auris/lib/python3.10/site-packages/nltk/tokenize/treebank.pyÚtokenizee   s2   ü



zTreebankWordTokenizer.tokenizec                 #   s^    |   |¡}d|v sd|v r#dd„ t d|¡D ƒ‰ ‡ fdd„|D ƒ}n|}t||ƒE dH  dS )a‰  
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import TreebankWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(TreebankWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in TreebankWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        r   r   c                 S   s   g | ]}|  ¡ ‘qS r,   )Úgroup)Ú.0Úmr,   r,   r-   Ú
<listcomp>Ë   s    z7TreebankWordTokenizer.span_tokenize.<locals>.<listcomp>z
``|'{2}|\"c                    s"   g | ]}|d v rˆ   d¡n|‘qS ))r   r	   r   r   )Úpop)r0   Útok©Zmatchedr,   r-   r2   Î   s    ÿÿN)r.   ÚreÚfinditerr   )r)   r   Z
raw_tokensÚtokensr,   r5   r-   Úspan_tokenize¬   s   €

þz#TreebankWordTokenizer.span_tokenizeN)FF)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r6   Úcompiler   r!   r"   r#   r$   r%   r   Ú_contractionsÚlistÚmapr&   r'   ÚstrÚboolr   r.   r   r   Úintr9   r,   r,   r,   r-   r      sX    ýþöú	üÿÿÿÿ
þ"Gr   c                	   @   s°  e Zd ZdZeƒ Zdd„ ejD ƒZdd„ ejD ƒZe 	d¡dfe 	d¡dfe 	d¡d	fe 	d
¡d	fe 	d¡dfgZ
e 	d¡dfZe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfe 	d¡dfgZe 	d¡dfe 	d¡dfe 	d¡d	fgZe 	d¡d fe 	d!¡dfe 	d"¡d#fe 	d$¡dfe 	d%¡dfe 	d&¡d'fe 	d(¡d)fgZe 	d*¡d+fe 	d,¡d)fe 	d-¡dfgZd7d/ee d0ed1efd2d3„Zd7d/ee d0ed1efd4d5„Zd6S )8ÚTreebankWordDetokenizera‹  
    The Treebank detokenizer uses the reverse regex operations corresponding to
    the Treebank tokenizer's regexes.

    Note:

    - There're additional assumption mades when undoing the padding of ``[;@#$%&]``
      punctuation symbols that isn't presupposed in the TreebankTokenizer.
    - There're additional regexes added in reversing the parentheses tokenization,
       such as the ``r'([\]\)\}\>])\s([:;,.])'``, which removes the additional right
       padding added to the closing parentheses precedding ``[:;,.]``.
    - It's not possible to return the original whitespaces as they were because
      there wasn't explicit records of where `'\n'`, `'\t'` or `'\s'` were removed at
      the text.split() operation.

    >>> from nltk.tokenize.treebank import TreebankWordTokenizer, TreebankWordDetokenizer
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks.'''
    >>> d = TreebankWordDetokenizer()
    >>> t = TreebankWordTokenizer()
    >>> toks = t.tokenize(s)
    >>> d.detokenize(toks)
    'Good muffins cost $3.88 in New York. Please buy me two of them. Thanks.'

    The MXPOST parentheses substitution can be undone using the ``convert_parentheses``
    parameter:

    >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
    >>> expected_tokens = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
    ... 'New', '-LRB-', 'York', '-RRB-', '.', 'Please', '-LRB-', 'buy',
    ... '-RRB-', 'me', 'two', 'of', 'them.', '-LRB-', 'Thanks', '-RRB-', '.']
    >>> expected_tokens == t.tokenize(s, convert_parentheses=True)
    True
    >>> expected_detoken = 'Good muffins cost $3.88 in New (York). Please (buy) me two of them. (Thanks).'
    >>> expected_detoken == d.detokenize(t.tokenize(s, convert_parentheses=True), convert_parentheses=True)
    True

    During tokenization it's safe to add more spaces but during detokenization,
    simply undoing the padding doesn't really help.

    - During tokenization, left and right pad is added to ``[!?]``, when
      detokenizing, only left shift the ``[!?]`` is needed.
      Thus ``(re.compile(r'\s([?!])'), r'\g<1>')``.

    - During tokenization ``[:,]`` are left and right padded but when detokenizing,
      only left shift is necessary and we keep right pad after comma/colon
      if the string after is a non-digit.
      Thus ``(re.compile(r'\s([:,])\s([^\d])'), r'\1 \2')``.

    >>> from nltk.tokenize.treebank import TreebankWordDetokenizer
    >>> toks = ['hello', ',', 'i', 'ca', "n't", 'feel', 'my', 'feet', '!', 'Help', '!', '!']
    >>> twd = TreebankWordDetokenizer()
    >>> twd.detokenize(toks)
    "hello, i can't feel my feet! Help!!"

    >>> toks = ['hello', ',', 'i', "can't", 'feel', ';', 'my', 'feet', '!',
    ... 'Help', '!', '!', 'He', 'said', ':', 'Help', ',', 'help', '?', '!']
    >>> twd.detokenize(toks)
    "hello, i can't feel; my feet! Help!! He said: Help, help?!"
    c                 C   ó   g | ]}t  | d d¡¡‘qS ©z(?#X)z\s©r6   r>   Úreplace©r0   Úpatternr,   r,   r-   r2     ó    ÿÿz"TreebankWordDetokenizer.<listcomp>c                 C   rF   rG   rH   rJ   r,   r,   r-   r2     rL   z+([^' ])\s('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) z\1\2 z([^' ])\s('[sS]|'[mM]|'[dD]|') z(\S)\s(\'\')ú\1\2z(\'\')\s([.,:)\]>};%])r   r   r   r   r
   ú(r   ú)r   ú[r   ú]r   Ú{r   Ú}z([\[\(\{\<])\sz\g<1>z\s([\]\)\}\>])z([\]\)\}\>])\s([:;,.])z([^'])\s'\sz\1' z\s([?!])z([^\.])\s(\.)([\]\)}>"\']*)\s*$z\1\2\3z([#$])\sz\s([;%])z
\s\.\.\.\sz...z\s([:,])z\1z([ (\[{<])\s``z\1``z(``)\sr	   Fr8   r   r   c                 C   sü   d  |¡}d| d }| jD ]}| d|¡}q| jD ]}| d|¡}q| jD ]
\}}| ||¡}q&| ¡ }| j\}}| ||¡}|rP| jD ]
\}}| ||¡}qE| jD ]
\}}| ||¡}qS| j	D ]
\}}| ||¡}qa| j
D ]
\}}| ||¡}qo| ¡ S )a¥  
        Treebank detokenizer, created by undoing the regexes from
        the TreebankWordTokenizer.tokenize.

        :param tokens: A list of strings, i.e. tokenized text.
        :type tokens: List[str]
        :param convert_parentheses: if True, replace PTB symbols with parentheses,
            e.g. `-LRB-` to `(`. Defaults to False.
        :type convert_parentheses: bool, optional
        :return: str
        r   rM   )Újoinr'   r    r&   r%   Ústripr$   r#   r"   r!   r   )r)   r8   r   r   r*   r+   r,   r,   r-   r.   [  s*   



z TreebankWordDetokenizer.tokenizec                 C   s   |   ||¡S )z&Duck-typing the abstract *tokenize()*.)r.   )r)   r8   r   r,   r,   r-   Ú
detokenize  s   z"TreebankWordDetokenizer.detokenizeN)F)r:   r;   r<   r=   r   r?   r&   r'   r6   r>   r%   r$   r#   r"   r!   r   r   rB   rC   r.   rV   r,   r,   r,   r-   rE   Ø   sZ    <þþþøúýþòý 5rE   )r=   r6   r   Útypingr   r   r   Znltk.tokenize.apir   Znltk.tokenize.destructiver   Znltk.tokenize.utilr   r   rE   r,   r,   r,   r-   Ú<module>   s   
	 <