
import os
import re
import subprocess
import sys
import tempfile

from nltk.data import ZipFilePathPointer
from nltk.internals import find_dir
from nltk.tokenize.api import TokenizerI


class ReppTokenizer(TokenizerI):
    """
A class for word tokenization using the REPP parser described in
Rebecca Dridan and Stephan Oepen (2012) Tokenization: Returning to a
Long Solved Problem - A Survey, Contrastive Experiment, Recommendations,
and Toolkit. In ACL. http://anthology.aclweb.org/P/P12/P12-2.pdf#page=406

>>> sents = ['Tokenization is widely regarded as a solved problem due to the high accuracy that rulebased tokenizers achieve.' ,
... 'But rule-based tokenizers are hard to maintain and their rules language specific.' ,
... 'We evaluated our method on three languages and obtained error rates of 0.27% (English), 0.35% (Dutch) and 0.76% (Italian) for our best models.'
... ]
>>> tokenizer = ReppTokenizer('/home/alvas/repp/') # doctest: +SKIP
>>> for sent in sents:                             # doctest: +SKIP
...     tokenizer.tokenize(sent)                   # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')

>>> for sent in tokenizer.tokenize_sents(sents): # doctest: +SKIP
...     print(sent)                              # doctest: +SKIP
...
(u'Tokenization', u'is', u'widely', u'regarded', u'as', u'a', u'solved', u'problem', u'due', u'to', u'the', u'high', u'accuracy', u'that', u'rulebased', u'tokenizers', u'achieve', u'.')
(u'But', u'rule-based', u'tokenizers', u'are', u'hard', u'to', u'maintain', u'and', u'their', u'rules', u'language', u'specific', u'.')
(u'We', u'evaluated', u'our', u'method', u'on', u'three', u'languages', u'and', u'obtained', u'error', u'rates', u'of', u'0.27', u'%', u'(', u'English', u')', u',', u'0.35', u'%', u'(', u'Dutch', u')', u'and', u'0.76', u'%', u'(', u'Italian', u')', u'for', u'our', u'best', u'models', u'.')
>>> for sent in tokenizer.tokenize_sents(sents, keep_token_positions=True): # doctest: +SKIP
...     print(sent)                                                         # doctest: +SKIP
...
[(u'Tokenization', 0, 12), (u'is', 13, 15), (u'widely', 16, 22), (u'regarded', 23, 31), (u'as', 32, 34), (u'a', 35, 36), (u'solved', 37, 43), (u'problem', 44, 51), (u'due', 52, 55), (u'to', 56, 58), (u'the', 59, 62), (u'high', 63, 67), (u'accuracy', 68, 76), (u'that', 77, 81), (u'rulebased', 82, 91), (u'tokenizers', 92, 102), (u'achieve', 103, 110), (u'.', 110, 111)]
[(u'But', 0, 3), (u'rule-based', 4, 14), (u'tokenizers', 15, 25), (u'are', 26, 29), (u'hard', 30, 34), (u'to', 35, 37), (u'maintain', 38, 46), (u'and', 47, 50), (u'their', 51, 56), (u'rules', 57, 62), (u'language', 63, 71), (u'specific', 72, 80), (u'.', 80, 81)]
[(u'We', 0, 2), (u'evaluated', 3, 12), (u'our', 13, 16), (u'method', 17, 23), (u'on', 24, 26), (u'three', 27, 32), (u'languages', 33, 42), (u'and', 43, 46), (u'obtained', 47, 55), (u'error', 56, 61), (u'rates', 62, 67), (u'of', 68, 70), (u'0.27', 71, 75), (u'%', 75, 76), (u'(', 77, 78), (u'English', 78, 85), (u')', 85, 86), (u',', 86, 87), (u'0.35', 88, 92), (u'%', 92, 93), (u'(', 94, 95), (u'Dutch', 95, 100), (u')', 100, 101), (u'and', 102, 105), (u'0.76', 106, 110), (u'%', 110, 111), (u'(', 112, 113), (u'Italian', 113, 120), (u')', 120, 121), (u'for', 122, 125), (u'our', 126, 129), (u'best', 130, 134), (u'models', 135, 141), (u'.', 141, 142)]
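
The ``repp_dir`` argument should point to a compiled REPP installation,
i.e. a directory containing the ``src/repp`` binary and the
``erg/repp.set`` configuration file; alternatively, a directory name that
can be resolved through the ``REPP_TOKENIZER`` environment variable may be
given (see :meth:`find_repptokenizer`).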
    """

    def __init__(self, repp_dir, encoding="utf8"):
        self.repp_dir = self.find_repptokenizer(repp_dir)
        # Set a directory to store the temporary files.
        self.working_dir = tempfile.gettempdir()
        # Set an encoding for the input strings.
        self.encoding = encoding

    def tokenize(self, sentence):
        """
Use Repp to tokenize a single sentence.

:param sentence: A single sentence string.
:type sentence: str
:return: A tuple of tokens.
:rtype: tuple(str)
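
This is a thin wrapper around :meth:`tokenize_sents`: the sentence is
wrapped in a single-element list and the first (and only) result is
returned.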
        """
        return next(self.tokenize_sents([sentence]))

    def tokenize_sents(self, sentences, keep_token_positions=False):
        """
Tokenize multiple sentences using Repp.

:param sentences: A list of sentence strings.
:type sentences: list(str)
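:param keep_token_positions: If True, yield each token as a
    ``(token, start, end)`` triple of character offsets instead of a bare
    string; defaults to False.
:type keep_token_positions: bool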
:return: An iterator over the tokenized sentences, each a tuple of tokens
:rtype: iter(tuple(str))
zrepp_input.wF)prefixdirmodedelete
N)r   NamedTemporaryFiler   writestrclosegenerate_repp_commandname_executedecoder   stripparse_repp_outputszip)
r   	sentenceskeep_token_positions
input_filesentcmdrepp_outputtokenized_sentstartsendss
             r   r   ReppTokenizer.tokenize_sentsH   s      (( d&6&6S
!  TT!12 ",,Z__=C--,33DMMBHHJK"&"9"9+"F+363G0ND$$	 #G
 
 
s   "C6B8C%	C6%
C3/C6c                 f    U R                   S-   /nUSU R                   S-   /-  nUSS/-  nX!/-  nU$ )z
Generate the REPP command to be run at the terminal.

:param inputfilename: path to the input file
:type inputfilename: str
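
As a rough illustration, assuming the tokenizer was created with
``ReppTokenizer('/home/alvas/repp')`` (as in the class-level examples) and
given a hypothetical temporary input file, the generated command would
look like::

    ['/home/alvas/repp/src/repp',
     '-c', '/home/alvas/repp/erg/repp.set',
     '--format', 'triple',
     '/tmp/repp_input.xyz']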
        """
        cmd = [self.repp_dir + "/src/repp"]
        cmd += ["-c", self.repp_dir + "/erg/repp.set"]
        cmd += ["--format", "triple"]
        cmd += [inputfilename]
        return cmd

    @staticmethod
    def _execute(cmd):
        # Run the REPP binary and return its raw standard output.
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        return stdout

    @staticmethod
    def parse_repp_outputs(repp_output):
        """
Parse the tri-tuple format that REPP outputs when the ``--format triple``
option is used, and return a generator over the tokenized sentences.

:param repp_output: the decoded standard output of the REPP binary.
:type repp_output: str
:return: a generator over the tokenized sentences, each given as a list
    of ``(token, start, end)`` triples.
:rtype: iter(list(tuple))
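
For illustration, a single sentence in the input this method expects is a
block of ``(start, end, token)`` lines; the triples below correspond to
the first three tokens of the second example sentence in the class
docstring::

    (0, 3, But)
    (4, 14, rule-based)
    (15, 25, tokenizers)

which this method turns into
``[('But', 0, 3), ('rule-based', 4, 14), ('tokenizers', 15, 25)]``.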
z^\((\d+), (\d+), (.+)\)$z

c              3   *   #    U  H	  oS    v   M     g7f)   N ).0ts     r   	<genexpr>3ReppTokenizer.parse_repp_outputs.<locals>.<genexpr>   s     =(<1A$(<s   N)recompile	MULTILINEsplitfindallinttuple)r2   
line_regexsectionstartendtokenwords_with_positionswordss           r   r+    ReppTokenizer.parse_repp_outputsu   s      ZZ ;R\\J
"((0G *4););G)D$)D%E E
CH-)D ! $ =(<==E&& 1$s   AB$B
5&Bc                    [         R                  R                  U5      (       a  UnO
[        USS9n[         R                  R                  US-   5      (       d   e[         R                  R                  US-   5      (       d   eU$ )zH
A module to find REPP tokenizer binary and its *repp.set* config file.
)REPP_TOKENIZER)env_varsr8   r9   )ospathexistsr   )r   repp_dirname	_repp_dirs      r   r
    ReppTokenizer.find_repptokenizer   sk     77>>,''$I 8KLIww~~i+56666ww~~i/9::::r   )r   r   r   N)utf8)F)__name__
__module____qualname____firstlineno____doc__r   r   r   r&   staticmethodr(   r+   r
   __static_attributes__rI   r   r   r   r      sI    @!	5%4  
 ' '(r   r   )r`   rN   r@   sysr   	nltk.datar   nltk.internalsr   nltk.tokenize.apir   r   rI   r   r   <module>rr      s-    
 	  
  ( # (@J @r   