"""
This is an NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
    N)perluniprops)
TokenizerI)xml_unescapec            	          \ rS rSrSr\R                  " S5      S4r\R                  " S5      S4r\R                  " S5      S4r	\R                  " S	5      S
4r
\R                  " S5      S4r\R                  " S5      S
4r\	\
\\/r\" SR                  \" \R$                  " S5      5      5      5      r\" SR                  \" \R$                  " S5      5      5      5      r\" SR                  \" \R$                  " S5      5      5      5      r\R,                  " SS\5      r\R,                  " SS\5      r\R,                  " SS\5      r\R                  " S5      S4r\R                  " S\ S\ S35      S
4r\R                  " S\ S\ S35      S4r\R                  " S\ S35      S4r\\\\/rS rSS jr  SS jr!Sr"g)NISTTokenizer   u  
This NIST tokenizer is sentence-based instead of the original
paragraph-based tokenization from mteval-v14.pl; the sentence-based
tokenization is consistent with the other tokenizers available in NLTK.

>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
>>> s = "Good muffins cost $3.88 in New York."
>>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
>>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
>>> nist.tokenize(s, lowercase=False) == expected_cased
True
>>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
True

international_tokenize() is the preferred method when tokenizing
non-European text, e.g.

>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()

# Input strings.
>>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
>>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
>>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

# Expected tokens.
>>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
>>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
>>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

>>> nist.international_tokenize(albb)[:10] == expected_albb
True
>>> nist.international_tokenize(amz)[:10] == expected_amz
True
>>> nist.international_tokenize(rkt)[:10] == expected_rkt
True

# Doctest for patching issue #1926
>>> sent = u'this is a foo☄sentence.'
>>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
>>> nist.international_tokenize(sent) == expected_sent
True
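
Both tokenizers also accept return_str=True to get the tokenized string
back instead of a list of tokens (a usage sketch that follows from the
doctests above):

>>> nist.tokenize(s, return_str=True)
'Good muffins cost $ 3.88 in New York .'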
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), r"\1 \2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), r" \1 \2"
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile(r"([0-9])(-)"), r"\1 \2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character sets used by the NIST tokenizer,
    # i.e. the Perl \p{N}, \p{P} and \p{S} character classes.
    pup_number = str("".join(set(perluniprops.chars("Number"))))
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))

    # Python regexes need to escape some special symbols inside
    # character classes, see https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit;
    # this mirrors the Perl substitutions s/(\P{N})(\p{P})/$1 $2 /g and
    # s/(\p{P})(\P{N})/ $1 $2/g in mteval-v14.pl.
    PUNCT_1 = re.compile(f"([^{number_regex}])([{punct_regex}])"), r"\1 \2 "
    PUNCT_2 = re.compile(f"([{punct_regex}])([^{number_regex}])"), r" \1 \2"
    # Tokenize symbols.
    SYMBOLS = re.compile(f"([{symbol_regex}])"), r" \1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It would arguably be better to unescape after STRIP_EOL_HYPHEN,
        # but this order stays close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strip heading and trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        # NB: the split_non_ascii flag is not consulted in the body below;
        # NONASCII is always applied via INTERNATIONAL_REGEXES.
        text = str(text)
        # Different from tokenize(), STRIP_EOL_HYPHEN is applied first,
        # before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space between words and
        # strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()