import json
import os
import tempfile
import warnings
from subprocess import PIPE

from nltk.internals import (_java_options, config_java, find_dir, find_file,
                            find_jar, java)
from nltk.tokenize.api import TokenizerI

_stanford_url = "https://nlp.stanford.edu/software"


class StanfordSegmenter(TokenizerI):
    """Interface to the Stanford Segmenter

    If the stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
    should be provided, for example::

        seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

    >>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    >>> seg = StanfordSegmenter() # doctest: +SKIP
    >>> seg.default_config('zh') # doctest: +SKIP
    >>> sent = u'这是斯坦福中文分词器测试'
    >>> print(seg.segment(sent)) # doctest: +SKIP
    这 是 斯坦福 中文 分词器 测试
    <BLANKLINE>
    >>> seg.default_config('ar') # doctest: +SKIP
    >>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
    >>> print(seg.segment(sent.split())) # doctest: +SKIP
    هذا هو تصنيف ستانفورد العربي ل الكلمات
    <BLANKLINE>
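
    The jar, model, and dictionary can also be supplied explicitly instead of
    relying on default_config and the environment variables; the example below
    is illustrative and the paths are placeholders for a local installation::

        seg = StanfordSegmenter(
            path_to_jar='/YOUR_PATH/stanford-segmenter.jar',
            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
            path_to_model='/YOUR_PATH/data/pku.gz',
            path_to_dict='/YOUR_PATH/data/dict-chris6.ser.gz',
            path_to_sihan_corpora_dict='/YOUR_PATH/data/')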
    zstanford-segmenter.jarNfalseUTF-8F-mx2gc                 C   s   t dt t jtdtdd t dt t| j|ddt|d}|d ur0td	|d
dt|d}nd }tj	
dd ||fD | _|| _|| _|| _|| _|| _|| _|	| _|| _|
d u r^i n|
}
d
dd |
 D | _d S )Nalwaysz}
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'   )
stacklevelignoreSTANFORD_SEGMENTER )env_vars
searchpathurlverbosezslf4j-api.jar)ZSLF4Jr   c                 s   s    | ]	}|d ur|V  qd S Nr   ).0_r   r   O/var/www/auris/lib/python3.10/site-packages/nltk/tokenize/stanford_segmenter.py	<genexpr>j   s    z-StanfordSegmenter.__init__.<locals>.<genexpr>,c                 s   s(    | ]\}}| d t | V  qdS )=N)jsondumps)r   keyvalr   r   r   r   x   s    
)warningssimplefilterDeprecationWarningwarnstrr   _JAR_stanford_urlospathsepjoin_stanford_jar_java_class_model_sihan_corpora_dict_sihan_post_processing_keep_whitespaces_dict	_encodingjava_optionsitems_options_cmd)selfZpath_to_jarZpath_to_slf4jZ
java_classZpath_to_modelpath_to_dictZpath_to_sihan_corpora_dictZsihan_post_processingZkeep_whitespacesencodingoptionsr   r6   Zstanford_segmenterZslf4jr   r   r   __init__8   sT   		

zStanfordSegmenter.__init__c              
   C   sJ  d}t jdrt jt jddh}d| _d| _d| _|dkr'd| _d}n]|d	kr}d
| _d}d| _d}zt	||t
ddd| _W n tyT } ztd| |d}~ww d}zt|t
ddd}t j||| _W n ty| } ztd| |d}~ww td| zt	||t
ddd| _W dS  ty } ztd| |d}~ww )z
        Attempt to initialize Stanford Word Segmenter for the specified language
        using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
        r   r   dataNr   arz=edu.stanford.nlp.international.arabic.process.ArabicSegmenterz'arabic-segmenter-atb+bn+arztrain.ser.gzzhz%edu.stanford.nlp.ie.crf.CRFClassifierzpku.gztruezdict-chris6.ser.gzF)STANFORD_MODELS)r   r   r   r   z_Could not find '%s' (tried using env. variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)z./data/r   )r   r   r   zMCould not find '%s' (tried using the STANFORD_SEGMENTER environment variable)zUnsupported language )rB   r   )r+   environgetpathr-   r4   r1   r2   r/   r   r*   LookupErrorr   r0   )r9   langsearch_pathmodelr:   eZ	sihan_dirZpath_to_sihan_dirr   r   r   default_config|   s   z StanfordSegmenter.default_configc                    s   t  | d S r   )supertokenize)r9   s	__class__r   r   rM      s   zStanfordSegmenter.tokenizec                 C   sL   | j d| jd| jd|g}| jdur|d| jd| jd| jg | |}|S ) -loadClassifier-keepAllWhitespaces	-textFileN-serDictionary-sighanCorporaDict-sighanPostProcessing)r/   r0   r3   r1   extendr4   r2   _execute)r9   Zinput_file_pathcmdstdoutr   r   r   segment_file   s(   
	
zStanfordSegmenter.segment_filec                 C   s   |  |gS r   )segment_sents)r9   tokensr   r   r   segment   s   zStanfordSegmenter.segmentc                 C   s   | j }tjdd\}| _t|d}ddd |D }t|tr(|r(|	|}|
| |  | jd| jd| jd	| jg}| jd
urQ|d| jd| jd| jg | |}t| j |S )rQ   T)textwb
c                 s   s    | ]}d  |V  qdS )rQ   N)r-   )r   xr   r   r   r      s    z2StanfordSegmenter.segment_sents.<locals>.<genexpr>rR   rS   rT   NrU   rV   rW   )r5   tempfilemkstempZ_input_file_pathr+   fdopenr-   
isinstancer(   encodewritecloser/   r0   r3   r1   rX   r4   r2   rY   unlink)r9   Z	sentencesr;   Z	_input_fh_inputrZ   r[   r   r   r   r]      s:   


	
zStanfordSegmenter.segment_sentsc                 C   sv   | j }|d|g | j}|r|d| jg dt}t| j|d t|| jt	t	d\}}|
|}t|dd |S )Nz-inputEncodingz-optionsrQ   )r<   r   )Z	classpathr[   stderrF)r5   rX   r8   r-   r   r   r6   r   r.   r   decode)r9   rZ   r   r;   r8   Zdefault_optionsr[   _stderrr   r   r   rY     s   



zStanfordSegmenter._execute)NNNNNNr   r   r   NFr   )F)__name__
__module____qualname____doc__r)   r=   rK   rM   r\   r_   r]   rY   __classcell__r   r   rO   r   r
       s,    
DI*r
   )r    r+   rd   r$   
subprocessr   Znltk.internalsr   r   r   r   r   r   Znltk.tokenize.apir	   r*   r
   r   r   r   r   <module>   s    