
    /h[                     2    S r SSKrSSKJr   " S S\5      rg)a  
The tok-tok tokenizer is a simple, general tokenizer, where the input has one
sentence per line; thus only final period is tokenized.

Tok-tok has been tested on, and gives reasonably good results for English,
Persian, Russian, Czech, French, German, Vietnamese, Tajik, and a few others.
The input should be in UTF-8 encoding.

Reference:
Jon Dehdari. 2014. A Neurophysiologically-Inspired Statistical Language
Model (Doctoral dissertation). Columbus, OH, USA: The Ohio State University.
    N)
TokenizerIc                      \ rS rSrSr\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r	\R                  " S5      S4r
\R                  " S	5      S
4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r\" S5      r\" S5      r\" S5      r\R                  " S\ S 35      S!4r\R                  " S\ S 35      S!4r\R                  " S\ S 35      S!4r\R                  " S"5      S#4r\R                  " S$5      S%4r\R                  " S&5      S'4r \R                  " S(5      S'4r!\R                  " S)5      S*4r"\R                  " S+5      S,4r#\R                  " S-5      S4r$\\\	\\\ \!\\\\\\\\\\\\
\\\\\$/r%S1S. jr&S/r'g0)2ToktokTokenizer   u  
This is a Python port of the tok-tok.pl from
https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl

>>> toktok = ToktokTokenizer()
>>> text = u'Is 9.5 or 525,600 my favorite number?'
>>> print(toktok.tokenize(text, return_str=True))
Is 9.5 or 525,600 my favorite number ?
>>> text = u'The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things'
>>> print(toktok.tokenize(text, return_str=True))
The https://github.com/jonsafari/tok-tok/blob/master/tok-tok.pl is a website with/and/or slashes and sort of weird : things
>>> text = u'¡This, is a sentence with weird» symbols… appearing everywhere¿'
>>> expected = u'¡ This , is a sentence with weird » symbols … appearing everywhere ¿'
>>> assert toktok.tokenize(text, return_str=True) == expected
>>> toktok.tokenize(text) == [u'¡', u'This', u',', u'is', u'a', u'sentence', u'with', u'weird', u'»', u'symbols', u'…', u'appearing', u'everywhere', u'¿']
True
     u1   ([،;؛¿!"\])}»›”؟¡%٪°±©®।॥…])z \1 u   ([({\[“‘„‚«‹「『])u
   ([–—])z& z&amp; 	z &#9; z\|z &#124; u   (?<!,)([,،])(?![,\d])u	   (['’`])z ` ` z `` z ' ' z '' z
(?<!\.)\.$z .u    (?<!\.)\.\s*(["'’»›”]) *$z . \1z(,{2,})z(-{2,})z(\.{2,})u   ([{༺༼᚛‚„⁅⁽₍〈❨❪❬❮❰❲❴⟅⟦⟨⟪⟬⟮⦃⦅⦇⦉⦋⦍⦏⦑⦓⦕⦗⧘⧚⧼⸢⸤⸦⸨〈《「『【〔〖〘〚〝﴾︗︵︷︹︻︽︿﹁﹃﹇﹙﹛﹝（［｛｟｢u   )]}༻༽᚜⁆⁾₎〉❩❫❭❯❱❳❵⟆⟧⟩⟫⟭⟯⦄⦆⦈⦊⦌⦎⦐⦒⦔⦖⦘⧙⧛⧽⸣⸥⸧⸩〉》」』】〕〗〙〛〞〟﴿︘︶︸︺︼︾﹀﹂﹄﹈﹚﹜﹞）］｝｠｣u   $¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺꠸﷼﹩＄￠￡￥￦z([z])z\1 z:(?!//)z : z\?(?!\S)z ? z(:\/\/)[\S+\.\S+\/\S+][\/]z / z /z^ + z\s+$
z {2,}c                     [        U5      nU R                   H  u  p4UR                  XA5      nM     [        UR                  5       5      nU(       a  U$ UR	                  5       $ )N)strTOKTOK_REGEXESsubstripsplit)selftext
return_strregexpsubstitutions        L/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tokenize/toktok.pytokenizeToktokTokenizer.tokenize   sQ    4y$($7$7 F::l1D %8 4::< !t3tzz|3     N)F)(__name__
__module____qualname____firstlineno____doc__recompileNON_BREAKINGFUNKY_PUNCT_1FUNKY_PUNCT_2EN_EM_DASHES	AMPERCENTTABPIPECOMMA_IN_NUMPROB_SINGLE_QUOTESSTUPID_QUOTES_1STUPID_QUOTES_2FINAL_PERIOD_1FINAL_PERIOD_2MULTI_COMMASMULTI_DASHES
MULTI_DOTSr   
OPEN_PUNCTCLOSE_PUNCTCURRENCY_SYMOPEN_PUNCT_RECLOSE_PUNCT_RECURRENCY_SYM_RE	URL_FOE_1	URL_FOE_2	URL_FOE_3	URL_FOE_4LSTRIPRSTRIP	ONE_SPACEr   r   __static_attributes__r   r   r   r   r      s   & ::h',L JJSTV]]MJJABGKM::l+W4L 

4 (*I
**T
H
$C::ej(D ::78'AL L17:jj*G3Ojj*G3O ZZ.5N ZZ GH(RN ::j)72L::j)72LK('1J 	/
J 	)
K 	5L JJJ<r23V;MZZ"[M 45v=Njj2l^2!67?O 

:&.I

;'/I

895@I

5!6)I ZZ#FZZ $&F

8$c)I 	1N64r   r   )r    r!   nltk.tokenize.apir   r   r   r   r   <module>rB      s     
 (W4j W4r   