
    /h*                     T    S r SSKrSSKJr   " S S\5      r\" 5       R
                  rg)a  
S-Expression Tokenizer

``SExprTokenizer`` is used to find parenthesized expressions in a
string.  In particular, it divides a string into a sequence of
substrings that are either parenthesized expressions (including any
nested parenthesized expressions), or other whitespace-separated
tokens.

    >>> from nltk.tokenize import SExprTokenizer
    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

By default, `SExprTokenizer` will raise a ``ValueError`` exception if
used to tokenize an expression with non-matching parentheses:

    >>> SExprTokenizer().tokenize('c) d) e (f (g')
    Traceback (most recent call last):
      ...
    ValueError: Un-matched close paren at char 1

The ``strict`` argument can be set to False to allow for
non-matching parentheses.  Any unmatched close parentheses will be
listed as their own s-expression; and the last partial sexpr with
unmatched open parentheses will be listed as its own sexpr:

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
    ['c', ')', 'd', ')', 'e', '(f (g']

The characters used for open and close parentheses may be customized
using the ``parens`` argument to the `SExprTokenizer` constructor:

    >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}')
    ['{a b {c d}}', 'e', 'f', '{g}']

The s-expression tokenizer is also available as a function:

    >>> from nltk.tokenize import sexpr_tokenize
    >>> sexpr_tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

    N)
TokenizerIc                   (    \ rS rSrSrSS jrS rSrg)SExprTokenizer9   a   
A tokenizer that divides strings into s-expressions.
An s-expresion can be either:

  - a parenthesized expression, including any nested parenthesized
    expressions, or
  - a sequence of non-whitespace non-parenthesis characters.

For example, the string ``(a (b c)) d e (f)`` consists of four
s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``.

By default, the characters ``(`` and ``)`` are treated as open and
close parentheses, but alternative strings may be specified.

:param parens: A two-element sequence specifying the open and close parentheses
    that should be used to find sexprs.  This will typically be either a
    two-character string, or a list of two strings.
:type parens: str or list
:param strict: If true, then raise an exception when tokenizing an ill-formed sexpr.
c                    [        U5      S:w  a  [        S5      eX l        US   U l        US   U l        [
        R                  " [
        R                  " US   5       S[
        R                  " US   5       35      U l        g )N   z'parens must contain exactly two stringsr      |)	len
ValueError_strict_open_paren_close_parenrecompileescape_paren_regexp)selfparensstricts      K/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tokenize/sexpr.py__init__SExprTokenizer.__init__O   sr    v;!FGG!!9"1IZZyy#$Abiiq	&:%;<
    c                    / nSnSnU R                   R                  U5       H  nUR                  5       nUS:X  a2  X!X5R                  5        R	                  5       -  nUR                  5       nX`R
                  :X  a  US-  nX`R                  :X  d  Mp  U R                  (       a"  US:X  a  [        SUR                  5       -  5      e[        SUS-
  5      nUS:X  d  M  UR                  XUR                  5        5        UR                  5       nM     U R                  (       a  US:  a  [        SU-  5      eU[        U5      :  a  UR                  XS 5        U$ )a  
Return a list of s-expressions extracted from *text*.
For example:

    >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)')
    ['(a b (c d))', 'e', 'f', '(g)']

All parentheses are assumed to mark s-expressions.
(No special processing is done to exclude parentheses that occur
inside strings, or following backslash characters.)

If the given expression contains non-matching parentheses,
then the behavior of the tokenizer depends on the ``strict``
parameter to the constructor.  If ``strict`` is ``True``, then
raise a ``ValueError``.  If ``strict`` is ``False``, then any
unmatched close parentheses will be listed as their own
s-expression; and the last partial s-expression with unmatched open
parentheses will be listed as its own s-expression:

    >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g')
    ['c', ')', 'd', ')', 'e', '(f (g']

:param text: the string to be tokenized
:type text: str or iter(str)
:rtype: iter(str)
r   r	   z!Un-matched close paren at char %dz Un-matched open paren at char %dN)r   finditergroupstartsplitr   r   r   r   maxappendendr   )r   textresultposdepthmparens          r   tokenizeSExprTokenizer.tokenizeY   s%   6 ##,,T2AGGIEzsWWY/5577ggi(((
)))<<EQJ$%H1779%TUUAuqy)A:MM$QUUW"56%%'C 3 <<EAI?#EFFT?MM$t*%r   )r   r   r   r   N)z()T)__name__
__module____qualname____firstlineno____doc__r   r)   __static_attributes__ r   r   r   r   9   s    *
0r   r   )r/   r   nltk.tokenize.apir   r   r)   sexpr_tokenizer1   r   r   <module>r4      s2   )V 
 (PZ Pf  !**r   