
r"""
NLTK Tokenizer Package

Tokenizers divide strings into lists of substrings.  For example,
tokenizers can be used to find the words and punctuation in a string:

    >>> from nltk.tokenize import word_tokenize
    >>> s = '''Good muffins cost $3.88\nin New York.  Please buy me
    ... two of them.\n\nThanks.'''
    >>> word_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

This particular tokenizer requires the Punkt sentence tokenization
models to be installed. NLTK also provides a simpler,
regular-expression based tokenizer, which splits text on whitespace
and punctuation:

    >>> from nltk.tokenize import wordpunct_tokenize
    >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.',
    'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
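
The same behaviour is also available through the lower-level
``regexp_tokenize`` helper; the sketch below assumes the word/punctuation
pattern ``r"\w+|[^\w\s]+"`` used by ``WordPunctTokenizer``:

    >>> from nltk.tokenize import regexp_tokenize
    >>> regexp_tokenize(s, r"\w+|[^\w\s]+") == wordpunct_tokenize(s)
    True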

We can also operate at the level of sentences, using the sentence
tokenizer directly as follows:

    >>> from nltk.tokenize import sent_tokenize, word_tokenize
    >>> sent_tokenize(s)
    ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.']
    >>> [word_tokenize(t) for t in sent_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'],
    ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']]

Caution: when tokenizing a Unicode string, make sure you are not
using an encoded version of the string (it may be necessary to
decode it first, e.g. with ``s.decode("utf8")``).

NLTK tokenizers can produce token-spans, represented as tuples of integers
having the same semantics as string slices, to support efficient comparison
of tokenizers.  (These methods are implemented as generators.)

    >>> from nltk.tokenize import WhitespaceTokenizer
    >>> list(WhitespaceTokenizer().span_tokenize(s)) # doctest: +NORMALIZE_WHITESPACE
    [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44),
    (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)]
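
The spans can be mapped back onto the original string to recover the
corresponding substrings (a short illustration using the spans shown above):

    >>> [s[start:end] for start, end in WhitespaceTokenizer().span_tokenize(s)] # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please',
    'buy', 'me', 'two', 'of', 'them.', 'Thanks.']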

There are numerous ways to tokenize text.  If you need more control over
tokenization, see the other methods provided in this package.

For further information, please see Chapter 3 of the NLTK book.
    N)load)TweetTokenizercasual_tokenize)NLTKWordTokenizer)LegalitySyllableTokenizer)MWETokenizer)PunktSentenceTokenizerPunktTokenizer)BlanklineTokenizerRegexpTokenizerWhitespaceTokenizerWordPunctTokenizerblankline_tokenizeregexp_tokenizewordpunct_tokenize)ReppTokenizer)SExprTokenizersexpr_tokenize)LineTokenizerSpaceTokenizerTabTokenizerline_tokenize)SyllableTokenizer)StanfordSegmenter)TextTilingTokenizer)ToktokTokenizer)TreebankWordDetokenizerTreebankWordTokenizer)regexp_span_tokenizestring_span_tokenizec                     [        U 5      $ )z
    A constructor for the PunktTokenizer that uses
    an LRU cache for performance.

    :param language: the model name in the Punkt corpus
    :type language: str
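
    Because the constructor is wrapped in ``functools.lru_cache``, repeated
    calls with the same language should return the same cached tokenizer
    instance (a minimal illustration; it assumes the Punkt models are
    installed, as the other examples in this module do):

        >>> _get_punkt_tokenizer("english") is _get_punkt_tokenizer("english")
        True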
    """
    return PunktTokenizer(language)


def sent_tokenize(text, language="english"):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into sentences
    :param language: the model name in the Punkt corpus
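
    A minimal usage sketch; the expected split mirrors the ``sent_tokenize``
    example in the module docstring above, with the newlines replaced by
    spaces:

        >>> sent_tokenize("Good muffins cost $3.88 in New York.  Please buy me two of them.  Thanks.")
        ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']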
    """
    tokenizer = _get_punkt_tokenizer(language)
    return tokenizer.tokenize(text)


# Standard word tokenizer, shared by word_tokenize().
_treebank_word_tokenizer = NLTKWordTokenizer()


def word_tokenize(text, language="english", preserve_line=False):
    """
    Return a tokenized copy of *text*,
    using NLTK's recommended word tokenizer
    (currently an improved :class:`.TreebankWordTokenizer`
    along with :class:`.PunktSentenceTokenizer`
    for the specified language).

    :param text: text to split into words
    :type text: str
    :param language: the model name in the Punkt corpus
    :type language: str
    :param preserve_line: A flag to decide whether to sentence tokenize the text or not.
    :type preserve_line: bool
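
    A minimal usage sketch; the expected tokens mirror the ``word_tokenize``
    example in the module docstring above:

        >>> word_tokenize("Good muffins cost $3.88 in New York.")
        ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.']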
)r*   _treebank_word_tokenizerr'   )r(   r"   preserve_line	sentencessenttokens         r#   word_tokenizer1      sK     (]4-JI##$1I1R1RSW1X1X)  s   (A)english)r2   F)6__doc__	functoolsre	nltk.datar   nltk.tokenize.casualr   r   nltk.tokenize.destructiver    nltk.tokenize.legality_principler   nltk.tokenize.mwer   nltk.tokenize.punktr	   r
   nltk.tokenize.regexpr   r   r   r   r   r   r   nltk.tokenize.reppr   nltk.tokenize.sexprr   r   nltk.tokenize.simpler   r   r   r   !nltk.tokenize.sonority_sequencingr    nltk.tokenize.stanford_segmenterr   nltk.tokenize.texttilingr   nltk.tokenize.toktokr   nltk.tokenize.treebankr   r   nltk.tokenize.utilr   r    	lru_cacher$   r*   r,   r1    r%   r#   <module>rH      s   2h  	  @ 7 F * F   - >  @ > 8 0 Q I $ $$ -. r%   