r"""
Simple Tokenizers

These tokenizers divide strings into substrings using the string
``split()`` method.
When tokenizing using a particular delimiter string, use
the string ``split()`` method directly, as this is more efficient.

The simple tokenizers are *not* available as separate functions;
instead, you should just use the string ``split()`` method directly:

    >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
    >>> s.split() # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.',
    'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']
    >>> s.split(' ') # doctest: +NORMALIZE_WHITESPACE
    ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
    'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
    >>> s.split('\n') # doctest: +NORMALIZE_WHITESPACE
    ['Good muffins cost $3.88', 'in New York.  Please buy me',
    'two of them.', '', 'Thanks.']

The simple tokenizers are mainly useful because they follow the
standard ``TokenizerI`` interface, and so can be used with any code
that expects a tokenizer.  For example, these tokenizers can be used
to specify the tokenization conventions when building a `CorpusReader`.
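
For example, a tokenizer can be passed to a corpus reader to control how
words are split.  The sketch below is illustrative only (the corpus
directory and file pattern are placeholders), so it is marked to be
skipped by doctest:

    >>> from nltk.corpus.reader import PlaintextCorpusReader # doctest: +SKIP
    >>> from nltk.tokenize import SpaceTokenizer # doctest: +SKIP
    >>> reader = PlaintextCorpusReader(
    ...     'my_corpus_dir', r'.*\.txt', word_tokenizer=SpaceTokenizer()
    ... ) # doctest: +SKIP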

"""

from nltk.tokenize.api import StringTokenizer, TokenizerI
from nltk.tokenize.util import regexp_span_tokenize, string_span_tokenize


class SpaceTokenizer(StringTokenizer):
    r"""Tokenize a string using the space character as a delimiter,
    which is the same as ``s.split(' ')``.

        >>> from nltk.tokenize import SpaceTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> SpaceTokenizer().tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '',
        'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.']
     N__name__
__module____qualname____doc___string r   r   C/var/www/auris/lib/python3.10/site-packages/nltk/tokenize/simple.pyr   *   s    
r   c                   @   r   )TabTokenizerzTokenize a string use the tab character as a delimiter,
    the same as ``s.split('\t')``.

        >>> from nltk.tokenize import TabTokenizer
        >>> TabTokenizer().tokenize('a\tb c\n\t d')
        ['a', 'b c\n', ' d']
    """

    _string = "\t"


class CharTokenizer(StringTokenizer):
    """Tokenize a string into individual characters.  If this functionality
    is ever required directly, use ``for char in string``.
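
    A brief illustration (``CharTokenizer`` is not re-exported from
    ``nltk.tokenize``, so import it from this module):

        >>> from nltk.tokenize.simple import CharTokenizer
        >>> CharTokenizer().tokenize("abc")
        ['a', 'b', 'c']
        >>> list(CharTokenizer().span_tokenize("abc"))
        [(0, 1), (1, 2), (2, 3)]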
    """

    _string = None

    def tokenize(self, s):
        return list(s)

    def span_tokenize(self, s):
        yield from enumerate(range(1, len(s) + 1))


class LineTokenizer(TokenizerI):
    r"""Tokenize a string into its lines, optionally discarding blank lines.
    This is similar to ``s.split('\n')``.

        >>> from nltk.tokenize import LineTokenizer
        >>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."
        >>> LineTokenizer(blanklines='keep').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', '', 'Thanks.']
        >>> # same as [l for l in s.split('\n') if l.strip()]:
        >>> LineTokenizer(blanklines='discard').tokenize(s) # doctest: +NORMALIZE_WHITESPACE
        ['Good muffins cost $3.88', 'in New York.  Please buy me',
        'two of them.', 'Thanks.']

    :param blanklines: Indicates how blank lines should be handled.  Valid values are:

        - ``discard``: strip blank lines out of the token list before returning it.
           A line is considered blank if it contains only whitespace characters.
        - ``keep``: leave all blank lines in the token list.
        - ``discard-eof``: if the string ends with a newline, then do not generate
           a corresponding token ``''`` after that newline.
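
        For example (an added illustration), ``discard-eof`` removes only a
        blank final line, such as the one ``str.splitlines`` yields for a
        string ending in consecutive newlines:

        >>> LineTokenizer(blanklines='discard-eof').tokenize("a\nb\n\n")
        ['a', 'b']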
    """

    def __init__(self, blanklines="discard"):
        valid_blanklines = ("discard", "keep", "discard-eof")
        if blanklines not in valid_blanklines:
            raise ValueError(
                "Blank lines must be one of: %s" % " ".join(valid_blanklines)
            )

        self._blanklines = blanklines

    def tokenize(self, s):
        lines = s.splitlines()
        # If requested, strip off blank lines.
        if self._blanklines == "discard":
            lines = [l for l in lines if l.rstrip()]
        elif self._blanklines == "discard-eof":
            if lines and not lines[-1].strip():
                lines.pop()
        return lines

    def span_tokenize(self, s):
        # ``discard-eof`` is not distinguished here: any run of blank lines
        # is folded into the delimiter, as with ``discard``.
        if self._blanklines == "keep":
            yield from string_span_tokenize(s, r"\n")
        else:
            yield from regexp_span_tokenize(s, r"\n(\s+\n)*")


######################################################################
# { Tokenization Functions
######################################################################


def line_tokenize(text, blanklines="discard"):
    return LineTokenizer(blanklines).tokenize(text)
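

# A quick usage sketch for ``line_tokenize`` (a thin convenience wrapper
# around ``LineTokenizer``; with the default ``blanklines='discard'`` it
# drops blank lines):
#
#     >>> from nltk.tokenize.simple import line_tokenize
#     >>> line_tokenize("one\ntwo\n\nthree")
#     ['one', 'two', 'three']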