o
    rZŽh¯  ã                   @   sj   d Z ddlZddlmZ ddlT ddlT ddlT G dd„ dƒZG dd„ deƒZ	G d	d
„ d
e
ƒZdd„ ZdS )a  
Read from the Senseval 2 Corpus.

SENSEVAL [http://www.senseval.org/]
Evaluation exercises for Word Sense Disambiguation.
Organized by ACL-SIGLEX [https://www.siglex.org/]

Prepared by Ted Pedersen <tpederse@umn.edu>, University of Minnesota,
https://www.d.umn.edu/~tpederse/data.html
Distributed with permission.

The NLTK version of the Senseval 2 files uses well-formed XML.
Each instance of the ambiguous words "hard", "interest", "line", and "serve"
is tagged with a sense identifier, and supplied with context.
é    N)ÚElementTree)Ú*c                   @   s   e Zd Zdd„ Zdd„ ZdS )ÚSensevalInstancec                 C   s    || _ t|ƒ| _|| _|| _d S ©N)ÚwordÚtupleÚsensesÚpositionÚcontext)Úselfr   r	   r
   r   © r   úJ/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/senseval.pyÚ__init__"   s   

zSensevalInstance.__init__c                 C   s   d| j | j| j| jf S )Nz=SensevalInstance(word=%r, position=%r, context=%r, senses=%r))r   r	   r
   r   )r   r   r   r   Ú__repr__(   s   üzSensevalInstance.__repr__N)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r   r   !   s    r   c                   @   s   e Zd Zddd„Zdd„ ZdS )ÚSensevalCorpusReaderNc                 C   s   t dd„ |  |d¡D ƒƒS )Nc                 S   s   g | ]	\}}t ||ƒ‘qS r   )ÚSensevalCorpusView)Ú.0ÚfileidÚencr   r   r   Ú
<listcomp>4   s    ÿÿz2SensevalCorpusReader.instances.<locals>.<listcomp>T)ÚconcatZabspaths)r   Zfileidsr   r   r   Ú	instances2   s
   
þÿzSensevalCorpusReader.instancesc                 C   sV   g }|  d¡D ]!}|  d¡D ]}|d jd }dd„ |d D ƒ}| ||f¡ qq|S )NÚlexeltÚinstancer   Úsenseidc                 S   s   g | ]
}|j |jd  f‘qS )Úpos)ÚtextÚattrib)r   Úwr   r   r   r   ?   s    z/SensevalCorpusReader._entry.<locals>.<listcomp>é   )Úfindallr    Úappend)r   ÚtreeÚeltsr   ÚinstZsenser
   r   r   r   Ú_entry:   s   ýzSensevalCorpusReader._entryr   )r   r   r   r   r(   r   r   r   r   r   1   s    
r   c                   @   s$   e Zd Zdd„ Zdd„ Zdd„ ZdS )r   c                 C   s,   t j| ||d tƒ | _dg| _d g| _d S )N)Úencodingr   )ÚStreamBackedCorpusViewr   ZWhitespaceTokenizerÚ_word_tokenizerÚ_lexelt_startsÚ_lexelts)r   r   r)   r   r   r   r   E   s   zSensevalCorpusView.__init__c           
      C   s.  t  | j| ¡ ¡d }| j| }g }d}	 | ¡ }|dkr%|g ks#J ‚g S | ¡  d¡rd|d7 }t 	d|¡}|d us<J ‚| 
d¡dd… }|t| jƒk rV|| j| ksUJ ‚n| j |¡ | j | ¡ ¡ | ¡  d¡rs|g ksqJ ‚d}|rz| |¡ | ¡  d	¡r–d
 |¡}t|ƒ}t |¡}	|  |	|¡gS q)Nr"   FTÚ z<lexeltzitem=("[^"]+"|'[^']+')éÿÿÿÿz	<instancez
</instanceÚ
)ÚbisectÚbisect_rightr,   Útellr-   ÚreadlineÚlstripÚ
startswithÚreÚsearchÚgroupÚlenr$   ÚjoinÚ_fixXMLr   Z
fromstringÚ_parse_instance)
r   ÚstreamZ
lexelt_numr   Zinstance_linesZin_instanceÚlineÚmZ	xml_blockr'   r   r   r   Ú
read_blockL   s:   



àzSensevalCorpusView.read_blockc                 C   s¨  g }g }d }|D ]Ä}|j dkr| |jd ¡ q|j dkrÆ|| j |j¡7 }|D ]œ}|j dkr3|d }|j dkr—|d u s@J dƒ‚|j ¡ sMt|ƒdksMJ ‚|j ¡ rZt|ƒdkrZJ ‚t|ƒ}|j ¡ rl| |j ¡ ¡ nL|d j d	kr“| |d j|d jd
 f¡ |d jr’|| j |d j¡7 }n%J dƒ‚|j d	kr¨| |j|jd
 f¡ n|j dkr®n
t	d|j ƒ J dƒ‚|jrÄ|| j |j¡7 }q(qJ d|j  ƒ‚t
||||ƒS )NZanswerr   r
   Zcompoundr   Úheadzhead specified twicer"   Zwfr   Fzexpected CDATA or wf in <head>ÚsZACKz expected CDATA or <wf> or <head>zunexpected tag %s)Útagr$   r    r+   Útokenizer   Ústripr:   ÚtailÚprintr   )r   r   r   r   r
   r	   ÚchildÚcwordr   r   r   r=   u   sH   





€

€ãz"SensevalCorpusView._parse_instanceN)r   r   r   r   rA   r=   r   r   r   r   r   D   s    )r   c                 C   sÖ   t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  d	d
| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} | S )z:
    Fix the various issues with Senseval pseudo-XML.
    z	<([~\^])>z\1z(\s+)\&(\s+)z	\1&amp;\2z"""z'"'z(<[^<]*snum=)([^">]+)>z\1"\2"/>z<\&frasl>\s*<p[^>]*>ZFRASLz
<\&I[^>]*>r.   z<{([^}]+)}>z	<(@|/?p)>z	<&\w+ \.>z<!DOCTYPE[^>]*>z<\[\/?[^>]+\]*>z
<(\&\w+;)>z&(?!amp|gt|lt|apos|quot)z'[ \t]*([^<>\s]+?)[ \t]*<p="([^"]*"?)"/>z <wf pos="\2">\1</wf>z\s*"\s*<p=\'"\'/>z <wf pos='"'>"</wf>)r7   Úsub)r   r   r   r   r<   ¡   s$   ÿr<   )Ú__doc__r7   Z	xml.etreer   Znltk.corpus.reader.apiZnltk.corpus.reader.utilZnltk.tokenizer   ZCorpusReaderr   r*   r   r<   r   r   r   r   Ú<module>   s   ]