# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
"""
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""

import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True

    international_tokenize() is the preferred function for tokenizing
    non-European text, e.g.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for patching issue #1926
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
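
    Both tokenize() and international_tokenize() can return the tokenized
    string instead of a list of tokens by passing return_str=True:

    >>> nist.tokenize(s, return_str=True) == ' '.join(expected_cased)
    True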
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile("<skipped>"), ""
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile("([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile("([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile("([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character classes used by the tokenizer.
    pup_number = str("".join(set(perluniprops.chars("Number"))))  # i.e. \p{N}
    pup_punct = str("".join(set(perluniprops.chars("Punctuation"))))  # i.e. \p{P}
    pup_symbol = str("".join(set(perluniprops.chars("Symbol"))))  # i.e. \p{S}

    # Python regexes need some special characters escaped when they appear
    # inside a character class.
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)

    # Pad runs of ASCII characters with spaces, separating them from
    # non-ASCII text.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile(f"([{number_regex}])([{punct_regex}])"), "\\1 \\2 "
    PUNCT_2 = re.compile(f"([{punct_regex}])([{number_regex}])"), " \\1 \\2"
    # Tokenize symbols.
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It would be cleaner to unescape after STRIP_EOL_HYPHEN, but this
        # order stays close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text
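
    # For example: lang_independent_sub('A<skipped>B &amp; C\u2028D')
    # returns 'AB & C D' (tag stripped, entity unescaped, line separator
    # replaced by a space).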
    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = " ".join(text.split())
        # Finally, strip heading and trailing spaces.
        text = str(text.strip())
        return text if return_str else text.split()
zNISTTokenizer.tokenizec                 C   s†   t |ƒ}| j\}}| ||¡}| j\}}| ||¡}t|ƒ}|r$| ¡ }| jD ]
\}}| ||¡}q'd | ¡  	¡ ¡}|r?|S | 	¡ S r   )
r   r   r   r   r   r   ÚINTERNATIONAL_REGEXESr   r   r   )r   r   r   Zsplit_non_asciir   r   r   r   r   r   Úinternational_tokenizež   s   

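

# A minimal usage sketch (an illustrative addition; the upstream module only
# defines the class above). international_tokenize() requires the
# 'perluniprops' data package, available through nltk.download().
if __name__ == "__main__":
    nist = NISTTokenizer()
    # Western text: currency symbols and sentence-final punctuation split off.
    print(nist.tokenize("Good muffins cost $3.88 in New York.", lowercase=True))
    # Mixed-script text: runs of ASCII are padded with spaces, separating
    # them from the CJK characters before punctuation is tokenized.
    print(nist.international_tokenize("Rakuten, Inc. (楽天株式会社)"))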