
    /h]%                     v    S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	J
r
JrJr  S SKJr  Sr " S S\5      rg)    N)PIPE)_java_optionsconfig_javafind_dir	find_filefind_jarjava)
TokenizerIz!https://nlp.stanford.edu/softwarec                   t   ^  \ rS rSrSrSr            SS jrS rU 4S jrS r	S r
S	 rSS
 jrSrU =r$ )StanfordSegmenter    u  Interface to the Stanford Segmenter

If stanford-segmenter version is older than 2016-10-31, then path_to_slf4j
should be provieded, for example::

    seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')

>>> from nltk.tokenize.stanford_segmenter import StanfordSegmenter
>>> seg = StanfordSegmenter() # doctest: +SKIP
>>> seg.default_config('zh') # doctest: +SKIP
>>> sent = u'这是斯坦福中文分词器测试'
>>> print(seg.segment(sent)) # doctest: +SKIP
这 是 斯坦福 中文 分词器 测试
<BLANKLINE>
>>> seg.default_config('ar') # doctest: +SKIP
>>> sent = u'هذا هو تصنيف ستانفورد العربي للكلمات'
>>> print(seg.segment(sent.split())) # doctest: +SKIP
هذا هو تصنيف ستانفورد العربي ل الكلمات
<BLANKLINE>
zstanford-segmenter.jarc           	      :   [         R                  " S[        5        [         R                  " [	        S5      [        SS9  [         R                  " S[        5        [        U R                  USS[        US9nUb  [        S	US
S[        US9nOS n[        R                  R                  S X4 5       5      U l        X0l        X@l        X`l        Xpl        Xl        XPl        Xl        Xl        U
c  0 OU
n
SR                  S U
R)                  5        5       5      U l        g )Nalwaysz}
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'   )
stacklevelignoreSTANFORD_SEGMENTER )env_vars
searchpathurlverbosezslf4j-api.jar)SLF4Jr   c              3   .   #    U  H  oc  M  Uv   M     g 7fNr   ).0_s     X/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tokenize/stanford_segmenter.py	<genexpr>-StanfordSegmenter.__init__.<locals>.<genexpr>j   s      -
2!AA2s   	,c              3   \   #    U  H"  u  pU S [         R                  " U5       3v   M$     g7f)=N)jsondumps)r   keyvals      r   r    r!   x   s)      %
7F83se1TZZ_%&s   *,)warningssimplefilterDeprecationWarningwarnstrr   _JAR_stanford_urlospathsepjoin_stanford_jar_java_class_model_sihan_corpora_dict_sihan_post_processing_keep_whitespaces_dict	_encodingjava_optionsitems_options_cmd)selfpath_to_jarpath_to_slf4j
java_classpath_to_modelpath_to_dictpath_to_sihan_corpora_dictsihan_post_processingkeep_whitespacesencodingoptionsr   r;   stanford_segmenterslf4js                  r   __init__StanfordSegmenter.__init__8   s     	h(:;Z
 	
 	h(:;%II,
 $8!E E  ZZ__ -
*2-
 
 &##= &;#!1!
!("WHH %
7>}}%
 
    c                    Sn[         R                  R                  S5      (       a>  [         R                  R	                  [         R                  R                  S5      S5      1nSU l        SU l        SU l        US:X  a
  SU l        SnOuUS	:X  aa  S
U l        SnSU l        Sn [        UU[        SSS9U l        Sn [        U[        SSS9n[         R                  R	                  Xv5      U l        O[        SU 35      e [        UU[        SSS9U l        g! [         a  n[        SU-  5      UeSnAff = f! [         a  n[        SU-  5      UeSnAff = f! [         a  n[        SU-  5      UeSnAff = f)z
Attempt to initialize Stanford Word Segmenter for the specified language
using the STANFORD_SEGMENTER and STANFORD_MODELS environment variables
r   r   dataNfalsearz=edu.stanford.nlp.international.arabic.process.ArabicSegmenterz'arabic-segmenter-atb+bn+arztrain.ser.gzzhz%edu.stanford.nlp.ie.crf.CRFClassifierzpku.gztruezdict-chris6.ser.gzF)STANFORD_MODELS)r   r   r   r   z_Could not find '%s' (tried using env. variables STANFORD_MODELS and <STANFORD_SEGMENTER>/data/)z./data/r   )r   r   r   zMCould not find '%s' (tried using the STANFORD_SEGMENTER environment variable)zUnsupported language )rT   r   )r0   environgetpathr2   r9   r6   r7   r4   r   r/   LookupErrorr   r5   )r>   langsearch_pathmodelrC   e	sihan_dirpath_to_sihan_dirs           r   default_config StanfordSegmenter.default_config|   s    ::>>.//77<<

7K(LfUVK 
#' &-#4<O  >ET\FDE*0D'/L& *%!1
 "I$,%!4	%! ,.77<<8I+U(  5dV<==	#&!BDK3  !P"# 	   !?AJK    	LNST 	sH   #D <4D8  E 
D5!D00D58
EEE
E7#E22E7c                 $   > [         TU ]  U5        g r   )supertokenize)r>   s	__class__s     r   rc   StanfordSegmenter.tokenize   s    rM   c                     U R                   SU R                  SU R                  SU/nU R                  b5  UR	                  SU R
                  SU R                  SU R                  /5        U R                  U5      nU$ ) -loadClassifier-keepAllWhitespaces	-textFile-serDictionary-sighanCorporaDict-sighanPostProcessing)r4   r5   r8   r6   extendr9   r7   _execute)r>   input_file_pathcmdstdouts       r   segment_fileStanfordSegmenter.segment_file   s     KK!""
 ##/JJ$JJ(,,+//	 s#rM   c                 &    U R                  U/5      $ r   )segment_sents)r>   tokenss     r   segmentStanfordSegmenter.segment   s    !!6(++rM   c                    U R                   n[        R                  " SS9u  o0l        [        R
                  " US5      nSR                  S U 5       5      n[        U[        5      (       a  U(       a  UR                  U5      nUR                  U5        UR                  5         U R                  SU R                  SU R                  SU R                  /nU R                  b5  UR!                  S	U R"                  S
U R                  SU R$                  /5        U R'                  U5      n[        R(                  " U R                  5        U$ )rh   T)textwb
c              3   D   #    U  H  nS R                  U5      v   M     g7f) N)r2   )r   xs     r   r    2StanfordSegmenter.segment_sents.<locals>.<genexpr>   s     :	1388A;;	s    ri   rj   rk   rl   rm   rn   )r:   tempfilemkstemp_input_file_pathr0   fdopenr2   
isinstancer-   encodewritecloser4   r5   r8   r6   ro   r9   r7   rp   unlink)r>   	sentencesrG   	_input_fh_inputrr   rs   s          r   rw   StanfordSegmenter.segment_sents   s   >>+3+;+;+F(	( IIi.	:	::fc""x]]8,F KK!""!!
 ##/JJ$JJ(,,+//	 s# 			$''(rM   c                 j   U R                   nUR                  SU/5        U R                  nU(       a  UR                  SU R                  /5        SR                  [        5      n[        U R                  US9  [        XR                  [        [        S9u  pgUR                  U5      n[        USS9  U$ )Nz-inputEncodingz-optionsr   )rH   r   )	classpathrs   stderrF)r:   ro   r=   r2   r   r   r;   r	   r3   r   decode)r>   rr   r   rG   r=   default_optionsrs   _stderrs           r   rp   StanfordSegmenter._execute  s    >>

$h/0((JJ
D$5$567((=1 	D--w?--d4
 x( 	OU;rM   )r9   r:   r   r4   r8   r5   r=   r6   r7   r3   r;   )NNNNNNrP   rP   zUTF-8NFz-mx2g)F)__name__
__module____qualname____firstlineno____doc__r.   rK   r_   rc   rt   ry   rw   rp   __static_attributes____classcell__)re   s   @r   r   r       sb    * $D #'% B
HGR6,(T rM   r   )r%   r0   r   r)   
subprocessr   nltk.internalsr   r   r   r   r   r	   nltk.tokenize.apir
   r/   r   r   rM   r   <module>r      s8     	     )3D
 DrM   