import re
import warnings
from typing import Iterator, List, Tuple

from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import align_tokens


class MacIntyreContractions:
    """
    List of contractions adapted from Robert MacIntyre's tokenizer.
    """

    CONTRACTIONS2 = [
        r"(?i)\b(can)(?#X)(not)\b",
        r"(?i)\b(d)(?#X)('ye)\b",
        r"(?i)\b(gim)(?#X)(me)\b",
        r"(?i)\b(gon)(?#X)(na)\b",
        r"(?i)\b(got)(?#X)(ta)\b",
        r"(?i)\b(lem)(?#X)(me)\b",
        r"(?i)\b(more)(?#X)('n)\b",
        r"(?i)\b(wan)(?#X)(na)(?=\s)",
    ]
    CONTRACTIONS3 = [r"(?i) ('t)(?#X)(is)\b", r"(?i) ('t)(?#X)(was)\b"]
    CONTRACTIONS4 = [r"(?i)\b(whad)(dd)(ya)\b", r"(?i)\b(wha)(t)(cha)\b"]
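
# Illustrative sketch (added for exposition; not part of the original module):
# each CONTRACTIONS2 pattern captures the two halves of a contraction in
# separate groups ("(?#X)" is just an inline regex comment), so substituting
# r" \1 \2 " splits the match into two whitespace-delimited tokens:
#
#     >>> import re
#     >>> re.sub(r"(?i)\b(can)(?#X)(not)\b", r" \1 \2 ", "You cannot do that").split()
#     ['You', 'can', 'not', 'do', 'that']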
    
class NLTKWordTokenizer(TokenizerI):
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the tokenizer that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" in that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKWordTokenizer.tokenize`, but there is no guarantee of recovering
    the original string.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r"``"),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r"\1 `` "),
        (re.compile(r"(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b", re.U), r"\1 \2"),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), " '' "),
        (re.compile(r"\s+"), " "),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
        (re.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) "), r"\1 \2 "),
    ]

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$', re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (re.compile(r"\.{2,}", re.U), r" \g<0> "),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        # Handles the final period.
        (re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'), r"\1 \2\3 "),
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (re.compile(r"[*]", re.U), r" \g<0> "),
    ]

    # Pads parentheses and brackets with spaces.
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # Optionally converts parentheses and brackets to PTB symbols.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")

    # List of contractions adapted from Robert MacIntyre's tokenizer.
    _contractions = MacIntyreContractions()
    CONTRACTIONS2 = list(map(re.compile, _contractions.CONTRACTIONS2))
    CONTRACTIONS3 = list(map(re.compile, _contractions.CONTRACTIONS3))
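
    # Illustrative sketch (added for exposition; not part of the original
    # module): `tokenize` applies each (pattern, replacement) pair above in
    # order.  For example, the PARENS_BRACKETS rule pads every bracket with
    # spaces so that the final `str.split` isolates it as its own token:
    #
    #     >>> regexp, substitution = NLTKWordTokenizer.PARENS_BRACKETS
    #     >>> regexp.sub(substitution, "(buy)")
    #     ' ( buy ) '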
    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should no longer be used.",
                category=DeprecationWarning,
                stacklevel=2,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handles parentheses.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)
        # Optionally convert parentheses to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handles double dash.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to make things easier.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp in self.CONTRACTIONS2:
            text = regexp.sub(r" \1 \2 ", text)
        for regexp in self.CONTRACTIONS3:
            text = regexp.sub(r" \1 \2 ", text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Convert converted quotes back to original double quotes.
        # Do this only if the original text contains double quote(s) or double
        # single-quotes (because '' might be transformed to `` if it is
        # treated as starting quotes).
        if ('"' in text) or ("''" in text):
            # Find double quotes and converted quotes.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Replace converted quotes back to double quotes.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
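
# Illustrative usage sketch (added for exposition; not part of the original
# module): `tokenize` destructively rewrites straight double quotes to the
# Penn Treebank-style `` and '' tokens, while `span_tokenize` maps them back
# so the yielded offsets line up with the original string:
#
#     >>> from nltk.tokenize import NLTKWordTokenizer
#     >>> NLTKWordTokenizer().tokenize('She said "Hello"')
#     ['She', 'said', '``', 'Hello', "''"]
#     >>> list(NLTKWordTokenizer().span_tokenize('She said "Hello"'))
#     [(0, 3), (4, 8), (9, 10), (10, 15), (15, 16)]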