r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.

"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    """

    # StringTokenizer.tokenize() splits the input on this delimiter.
    _string = " "
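

# ``StringTokenizer`` (nltk.tokenize.api) implements ``tokenize()`` as
# ``s.split(self._string)``, so a single-delimiter tokenizer only needs to
# set ``_string``.  A hypothetical sketch (``CommaTokenizer`` is not part
# of NLTK):
#
#     class CommaTokenizer(StringTokenizer):
#         _string = ","
#
#     CommaTokenizer().tokenize("a,b,c")  # -> ['a', 'b', 'c']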


class TabTokenizer(StringTokenizer):
    r"""Tokenize a string using the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
    """

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        # Character i occupies the span (i, i + 1).
        yield from enumerate(range(1, len(s) + 1))
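

# A quick illustration (editorial addition, not from the original source):
# CharTokenizer emits one token per character, and span_tokenize() pairs
# index i with the span (i, i + 1):
#
#     >>> CharTokenizer().tokenize("abc")
#     ['a', 'b', 'c']
#     >>> list(CharTokenizer().span_tokenize("abc"))
#     [(0, 1), (1, 2), (2, 3)]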


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
          A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
          a corresponding token ``''`` after that newline.
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
        # For ``discard`` and ``discard-eof`` alike, runs of consecutive
        # blank lines are folded into a single span boundary.
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
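

# A minimal usage sketch (illustrative addition, not part of the original
# module).  It exercises the three blank-line policies; note that
# ``discard-eof`` drops only a blank *final* line, keeping interior blanks.
if __name__ == "__main__":
    s = "Good muffins cost $3.88\nin New York.\n\nThanks.\n\n"
    print(LineTokenizer(blanklines="keep").tokenize(s))
    # -> ['Good muffins cost $3.88', 'in New York.', '', 'Thanks.', '']
    print(LineTokenizer(blanklines="discard").tokenize(s))
    # -> ['Good muffins cost $3.88', 'in New York.', 'Thanks.']
    print(LineTokenizer(blanklines="discard-eof").tokenize(s))
    # -> ['Good muffins cost $3.88', 'in New York.', '', 'Thanks.']
    print(line_tokenize(s))  # line_tokenize() defaults to 'discard'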