"""
Multi-Word Expression Tokenizer

A ``MWETokenizer`` takes a sentence that has already been divided into tokens and
retokenizes it, merging multi-word expressions into single tokens, using a lexicon
of MWEs:

    >>> from nltk.tokenize import MWETokenizer

    >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')])
    >>> tokenizer.add_mwe(('in', 'spite', 'of'))

    >>> tokenizer.tokenize('Testing testing testing one two three'.split())
    ['Testing', 'testing', 'testing', 'one', 'two', 'three']

    >>> tokenizer.tokenize('This is a test in spite'.split())
    ['This', 'is', 'a', 'test', 'in', 'spite']

    >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split())
    ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of']

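Matching is greedy: when several listed MWEs share a common prefix, the
longest expression that completely matches wins:

    >>> tokenizer.tokenize('a little bit of a lot'.split())
    ['a_little_bit', 'of', 'a_lot']
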
    )
TokenizerI)Triec                   .    \ rS rSrSrSS jrS rS rSrg)	MWETokenizer#   z`A tokenizer that processes tokenized text and merges multi-word expressions
into single tokens.
Nc                 B    U(       d  / n[        U5      U l        X l        g)ae  Initialize the multi-word tokenizer with a list of expressions and a
separator

        :type mwes: list(list(str))
        :param mwes: A sequence of multi-word expressions to be merged, where
            each MWE is a sequence of strings.
        :type separator: str
        :param separator: String that should be inserted between words in a multi-word
            expression token. (Default is '_')

        """
        if not mwes:
            mwes = []
        self._mwes = Trie(mwes)
        self._separator = separator

    def add_mwe(self, mwe):
        """Add a multi-word expression to the lexicon (stored as a word trie)

        We use ``util.Trie`` to represent the trie. Its form is a dict of dicts.
        The key True marks the end of a valid MWE.

        :param mwe: The multi-word expression we're adding into the word trie
        :type mwe: tuple(str) or list(str)

        :Example:

        >>> tokenizer = MWETokenizer()
        >>> tokenizer.add_mwe(('a', 'b'))
        >>> tokenizer.add_mwe(('a', 'b', 'c'))
        >>> tokenizer.add_mwe(('a', 'x'))
        >>> expected = {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}}
        >>> tokenizer._mwes == expected
        True

        """
        self._mwes.insert(mwe)

    def tokenize(self, text):
        """

        :param text: A list containing tokenized text
        :type text: list(str)
        :return: A list of the tokenized text with multi-words merged together
        :rtype: list(str)

        :Example:

        >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+')
        >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split())
        ['An', "hors+d'oeuvre", 'tonight,', 'sir?']

        """
        i = 0
        n = len(text)
        result = []

        while i < n:
            if text[i] in self._mwes:
                # possible MWE match: walk the trie as far as the input allows,
                # remembering where the longest complete MWE ended
                j = i
                trie = self._mwes
                last_match = -1
                while j < n and text[j] in trie:
                    trie = trie[text[j]]
                    j = j + 1
                    if Trie.LEAF in trie:
                        last_match = j
                if last_match > -1:
                    j = last_match

                if Trie.LEAF in trie or last_match > -1:
                    # success: merge text[i:j] into a single token
                    result.append(self._separator.join(text[i:j]))
                    i = j
                else:
                    # no complete MWE here, so keep the token as-is
                    result.append(text[i])
                    i += 1
            else:
                result.append(text[i])
                i += 1

        return result
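
# Minimal usage sketch of the behaviour documented above (an illustrative
# demo block, not part of the original module):
if __name__ == "__main__":
    tokenizer = MWETokenizer([("a", "little"), ("a", "little", "bit")])
    tokenizer.add_mwe(("in", "spite", "of"))
    # The longest matching MWE wins: 'a little bit' outranks 'a little'.
    print(tokenizer.tokenize("a little bit in spite of".split()))
    # -> ['a_little_bit', 'in_spite_of']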