r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).
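
For example, a UTF-8 encoded byte string should be decoded before it is
passed to the tokenizer (a minimal sketch):

    >>> word_tokenize(b"Good muffins cost $3.88".decode("utf8"))
    ['Good', 'muffins', 'cost', '$', '3.88']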

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
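
Because the spans have the same semantics as string slices, they can be used
to recover the matching substrings from the original string:

    >>> [s[start:end] for start, end in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy',
    'me', 'two', 'of', 'them.', 'Thanks.']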

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
    N)load)TweetTokenizercasual_tokenize)NLTKWordTokenizer)LegalitySyllableTokenizer)MWETokenizer)PunktSentenceTokenizerPunktTokenizer)BlanklineTokenizerRegexpTokenizerWhitespaceTokenizerWordPunctTokenizerblankline_tokenizeregexp_tokenizewordpunct_tokenize)ReppTokenizer)SExprTokenizersexpr_tokenize)LineTokenizerSpaceTokenizerTabTokenizerline_tokenize)SyllableTokenizer)StanfordSegmenter)TextTilingTokenizer)ToktokTokenizer)TreebankWordDetokenizerTreebankWordTokenizer)regexp_span_tokenizestring_span_tokenizeenglishc                 C   s   t | S )z
    A constructor for the PunktTokenizer that uses
    an LRU cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
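
    Because of the LRU cache, repeated calls with the same language return
    the same tokenizer instance (this loads the Punkt models, like the other
    doctests in this package):

    >>> _get_punkt_tokenizer("english") is _get_punkt_tokenizer("english")
    True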
    """
    return PunktTokenizer(language)


def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
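
    A short example (requires the Punkt models to be installed):

    >>> sent_tokenize("Please buy me two of them. Thanks.")
    ['Please buy me two of them.', 'Thanks.']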
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer, used by word_tokenize() below.
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: If True, do not sentence-tokenize *text* first.
    :type preserve_line: bool
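
    For example, ``preserve_line=True`` skips the sentence-splitting step, so
    a sentence-internal period typically stays attached to the preceding word:

    >>> word_tokenize("Buy me two of them. Thanks.")
    ['Buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
    >>> word_tokenize("Buy me two of them. Thanks.", preserve_line=True)
    ['Buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']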
    """
    sentences = [text] if preserve_line else sent_tokenize(text, language)
    return [
        token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
    ]