"""
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only the final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
"""

import re

from nltk.tokenize.api import TokenizerI


class ToktokTokenizer(TokenizerI):
    """
    This is a Python port of the tok-tok.pl from
    https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

    >>> toktok = ToktokTokenizer()
    >>> text = u'Is 9.5 or 525,600 my favorite number?'
    >>> print(toktok.tokenize(text, return_str=True))
    Is 9.5 or 525,600 my favorite number ?
    >>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
    >>> print(toktok.tokenize(text, return_str=True))
    The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
    >>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
    >>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
    >>> assert toktok.tokenize(text, return_str=True) == expected
    >>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
    True
         u1   ([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])z \1 u   ([({\[“‘„‚«‹「『])u
   ([–—])z& z&amp; 	z &#9; z\|z &#124; u   (?<!,)([,،])(?![,\d])u	   (['’`])z ` ` z `` z ' ' z '' z
(?<!\.)\.$z .u    (?<!\.)\.\s*(["'’»›”]) *$z . \1z(,{2,})z(-{2,})z(\.{2,})u   ([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢u   )]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣u   $¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦z([z])z\1 z:(?!//)z : z\?(?!\S)z ? z(:\/\/)[\S+\.\S+\/\S+][\/]z / z /z^ + z\s+$
z {2,}Fc                 C   s@   t |}| jD ]
\}}|||}qt | }|r|S | S )N)strTOKTOK_REGEXESsubstripsplit)selftextZ
return_strregexpZsubstitution r   C/var/www/auris/lib/python3.10/site-packages/nltk/tokenize/toktok.pytokenize   s
   zToktokTokenizer.tokenizeN)F)&__name__
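
# ---------------------------------------------------------------------------
# A minimal usage sketch, added for illustration (it is not part of the
# original NLTK module). It shows the regex cascade in action: most patterns
# in TOKTOK_REGEXES pad one class of characters with spaces, a few rewrite
# problematic characters (&, tab, |) as character references, and the final
# str.split() turns the padded string into tokens.
if __name__ == "__main__":
    toktok = ToktokTokenizer()
    sample = "Hello, world! Prices start at $9.99..."
    # return_str=True returns the space-padded string instead of a token list.
    print(toktok.tokenize(sample, return_str=True))
    print(toktok.tokenize(sample))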