
    /h                     z    S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	J
r
  S SKJr  S SKJr  Sr " S S\5      rg)	    N)PIPE)_java_optionsconfig_javafind_jarjava)CoreNLPParser)
TokenizerIz1https://nlp.stanford.edu/software/tokenizer.shtmlc                   P    \ rS rSrSrSr     S
S jr\S 5       rS r	SS jr
S	rg)StanfordTokenizer   a"  
Interface to the Stanford Tokenizer

>>> from nltk.tokenize.stanford import StanfordTokenizer
>>> s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
>>> StanfordTokenizer().tokenize(s) # doctest: +SKIP
['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
>>> s = "The colour of the wall is blue."
>>> StanfordTokenizer(options={"americanize": True}).tokenize(s) # doctest: +SKIP
['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.']
zstanford-postagger.jarNc           	         [         R                  " [        S5      [        SS9  [	        U R
                  USS[        US9U l        X l        XPl	        Uc  0 OUnSR                  S UR                  5        5       5      U l        g )	Nzz
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.'   )
stacklevel)STANFORD_POSTAGGER )env_vars
searchpathurlverbose,c              3   4   #    U  H  u  pU S U 3v   M     g7f)=Nr   ).0keyvals      N/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tokenize/stanford.py	<genexpr>-StanfordTokenizer.__init__.<locals>.<genexpr>E   s     $TOuAcU^Os   )warningswarnstrDeprecationWarningr   _JAR_stanford_url_stanford_jar	_encodingjava_optionsjoinitems_options_cmd)selfpath_to_jarencodingoptionsr   r'   s         r   __init__StanfordTokenizer.__init__%   s     	W
 	
 &II,
 "("WHH$TGMMO$TT    c                 "    U R                  5       $ )N)
splitlines)ss    r   _parse_tokenized_output)StanfordTokenizer._parse_tokenized_outputG   s    ||~r1   c                 H    S/nU R                  U R                  X!5      5      $ )zG
Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences.
z%edu.stanford.nlp.process.PTBTokenizer)r5   _execute)r+   r4   cmds      r   tokenizeStanfordTokenizer.tokenizeK   s&     77++DMM#,ABBr1   c                    U R                   nUR                  SU/5        U R                  nU(       a  UR                  SU R                  /5        SR                  [        5      n[        U R                  US9  [        R                  " SSS9 n[        U[        5      (       a  U(       a  UR                  U5      nUR                  U5        UR                  5         UR                  UR                  5        [!        XR"                  [$        [$        S9u  pUR'                  U5      nS S S 5        [(        R*                  " WR                  5        [        USS9  W$ ! , (       d  f       N:= f)	Nz-charsetz-options )r.   r   wbF)modedelete)	classpathstdoutstderr)r&   extendr*   r(   r   r   r'   tempfileNamedTemporaryFile
isinstancer!   encodewriteflushappendnamer   r%   r   decodeosunlink)
r+   r9   input_r   r-   r*   default_options
input_filerB   rC   s
             r   r8   StanfordTokenizer._executeR   s   >>

J)*((JJ
D$5$567((=1 	D--w? ((d5AZ&#&&8x0V$JJz' "11$tNF ]]8,F B 			*//" 	OU;) BAs   BE
E*)r&   r*   r%   r'   )Nutf8NFz-mx1000m)F)__name__
__module____qualname____firstlineno____doc__r#   r/   staticmethodr5   r:   r8   __static_attributes__r   r1   r   r   r      sE    
 $D  UD  C!r1   r   )jsonrN   rE   r   
subprocessr   nltk.internalsr   r   r   r   nltk.parse.corenlpr   nltk.tokenize.apir	   r$   r   r   r1   r   <module>ra      s5     	    E E , (C]
 ]r1   