o
    rZhT                     @   s   d Z ddlZddlT ddlT ddlmZ ddlmZ ddlm	Z	m
Z
 G dd deZG d	d
 d
ZG dd deZG dd deZdS )z!
Read CoNLL-style chunk fileids.
    N)*)map_tag)Tree)LazyConcatenationLazyMapc                   @   s,  e Zd ZdZdZdZdZdZdZdZ	dZ
eeeeee	e
fZd	d
ddded	d	fddZd<ddZd<ddZd=ddZd=ddZd>ddZd>ddZd>ddZd<ddZd?d d!Zd=d"d#Zd=d$d%Zd<d&d'Zd(d) Zd*d+ Zd<d,d-Zd<d.d/Zd<d0d1Zd<d2d3Zd4d5 Z d6d7 Z!d8d9 Z"e#d:d; Z$d	S )@ConllCorpusReadera  
    A corpus reader for CoNLL-style files.  These files consist of a
    series of sentences, separated by blank lines.  Each sentence is
    encoded using a table (or "grid") of values, where each line
    corresponds to a single word, and each column corresponds to an
    annotation type.  The set of columns used by CoNLL-style files can
    vary from corpus to corpus; the ``ConllCorpusReader`` constructor
    therefore takes an argument, ``columntypes``, which is used to
    specify the columns that are used by a given corpus. By default
    columns are split by consecutive whitespaces, with the
    ``separator`` argument you can set a string to split by (e.g.
    ``'	'``).


    @todo: Add support for reading from corpora where different
        parallel files contain different columns.
    @todo: Possibly add caching of the grid corpus view?  This would
        allow the same grid view to be used by different data access
        methods (eg words() and parsed_sents() could both share the
        same grid corpus view object).
    @todo: Better support for -DOCSTART-.  Currently, we just ignore
        it, but it could be used to define methods that retrieve a
        document at a time (eg parsed_documents()).
    wordspostreechunknesrlignoreNSFTutf8c                 C   s   |D ]}|| j vrtd| qt|tr|g}|| _dd t|D | _|| _|| _|| _	|	| _
t| ||| |
| _|| _d S )NzBad column type %rc                 S   s   i | ]\}}||qS  r   ).0icr   r   G/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/conll.py
<dictcomp>W   s    z.ConllCorpusReader.__init__.<locals>.<dictcomp>)COLUMN_TYPES
ValueError
isinstancestr_chunk_types	enumerate_colmap_pos_in_tree_root_label_srl_includes_roleset_tree_classCorpusReader__init___tagsetsep)selfrootfileidscolumntypeschunk_typesZ
root_labelpos_in_treeZsrl_includes_rolesetencodingZ
tree_classtagset	separator
columntyper   r   r   r#   C   s   


zConllCorpusReader.__init__c                 C   s"   |  | j tt| j| |S N)_requireWORDSr   r   
_get_words_gridsr&   r(   r   r   r   r   d   s   zConllCorpusReader.wordsc                 C      |  | j t| j| |S r0   )r1   r2   r   r3   r4   r5   r   r   r   sentsh      zConllCorpusReader.sentsc                    s2      j j  fdd}tt| |S )Nc                         | S r0   _get_tagged_wordsgridr&   r-   r   r   get_tagged_wordso      z8ConllCorpusReader.tagged_words.<locals>.get_tagged_words)r1   r2   POSr   r   r4   r&   r(   r-   r?   r   r>   r   tagged_wordsl   s   zConllCorpusReader.tagged_wordsc                    s.      j j  fdd}t| |S )Nc                    r9   r0   r:   r<   r>   r   r   r?   w   r@   z8ConllCorpusReader.tagged_sents.<locals>.get_tagged_words)r1   r2   rA   r   r4   rB   r   r>   r   tagged_sentst   s   zConllCorpusReader.tagged_sentsc                    sF    jjj  d u rj  fdd}tt||S )Nc                        |  S r0   _get_chunked_wordsr<   r*   r&   r-   r   r   get_chunked_words      z:ConllCorpusReader.chunked_words.<locals>.get_chunked_words)r1   r2   rA   CHUNKr   r   r   r4   r&   r(   r*   r-   rI   r   rH   r   chunked_words|   s
   zConllCorpusReader.chunked_wordsc                    B    jjj  d u rj  fdd}t||S )Nc                    rE   r0   rF   r<   rH   r   r   rI      rJ   z:ConllCorpusReader.chunked_sents.<locals>.get_chunked_words)r1   r2   rA   rK   r   r   r4   rL   r   rH   r   chunked_sents   
   zConllCorpusReader.chunked_sentsc                    rN   )Nc                    rE   r0   )_get_parsed_sentr<   r+   r&   r-   r   r   get_parsed_sent   rJ   z7ConllCorpusReader.parsed_sents.<locals>.get_parsed_sent)r1   r2   rA   TREEr   r   r4   )r&   r(   r+   r-   rS   r   rR   r   parsed_sents   rP   zConllCorpusReader.parsed_sentsc                 C   r6   r0   )r1   SRLr   _get_srl_spansr4   r5   r   r   r   	srl_spans   r8   zConllCorpusReader.srl_spansc                    sT    jjjj  d u rj  fdd}t||}|r(t|}|S )Nc                    s    |  S r0   )_get_srl_instancesr<   r+   r&   r   r   get_srl_instances   r@   z:ConllCorpusReader.srl_instances.<locals>.get_srl_instances)	r1   r2   rA   rT   rV   r   r   r4   r   )r&   r(   r+   flattenr[   resultr   rZ   r   srl_instances   s   zConllCorpusReader.srl_instancesc                    s6      j j j  fdd}tt| |S )z
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                    r9   r0   _get_iob_wordsr<   r>   r   r   get_iob_words   r@   z2ConllCorpusReader.iob_words.<locals>.get_iob_words)r1   r2   rA   rK   r   r   r4   r&   r(   r-   ra   r   r>   r   	iob_words   s   zConllCorpusReader.iob_wordsc                    s2      j j j  fdd}t| |S )z
        :return: a list of lists of word/tag/IOB tuples
        :rtype: list(list)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        c                    r9   r0   r_   r<   r>   r   r   ra      r@   z2ConllCorpusReader.iob_sents.<locals>.get_iob_words)r1   r2   rA   rK   r   r4   rb   r   r>   r   	iob_sents   s   zConllCorpusReader.iob_sentsc                    s   t  fdd |dD S )Nc                    s    g | ]\}}t | j|d qS ))r,   )ZStreamBackedCorpusView_read_grid_block)r   Zfileidencr&   r   r   
<listcomp>   s    z,ConllCorpusReader._grids.<locals>.<listcomp>T)concatZabspathsr5   r   rg   r   r4      s
   

zConllCorpusReader._gridsc                    s   g }t |D ]?}| }|sq fdd|dD }|d  jdd dkr+|d= |D ]}t|t|d kr?td| q-|| q|S )Nc                    s   g | ]}|  jqS r   )splitr%   )r   linerg   r   r   rh          z6ConllCorpusReader._read_grid_block.<locals>.<listcomp>
r   r   z
-DOCSTART-z"Inconsistent number of columns:
%s)Zread_blankline_blockstriprj   r   getlenr   append)r&   streamZgridsblockr=   rowr   rg   r   re      s   z"ConllCorpusReader._read_grid_blockc                 C   s   |  || jd S )Nr   )_get_columnr   )r&   r=   r   r   r   r3      s   zConllCorpusReader._get_wordsc                    sP     | jd }r jkr fdd|D }tt  | jd |S )Nr	   c                       g | ]	}t  j|qS r   r   r$   r   tr>   r   r   rh          z7ConllCorpusReader._get_tagged_words.<locals>.<listcomp>r   ru   r   r$   listzipr&   r=   r-   pos_tagsr   r>   r   r;      s   z#ConllCorpusReader._get_tagged_wordsc              	      s`     | jd }r jkr fdd|D }tt  | jd |  | jd S )Nr	   c                    rv   r   rw   rx   r>   r   r   rh      rz   z4ConllCorpusReader._get_iob_words.<locals>.<listcomp>r   r   r{   r~   r   r>   r   r`      s   z ConllCorpusReader._get_iob_wordsc                    s.    | jd }  | jd }r# jkr# fdd|D }  | jd }t jg g}t|||D ]Y\}}	}
|
dkrGd\}}n|
d\}}|d urX||vrXd}|d	krf||d
  krfd}|dv rtt|dkrt|	  |dkrt|g }|d
 
| |
| |d
 
||	f q9|d S )Nr   r	   c                    rv   r   rw   rx   r>   r   r   rh     rz   z8ConllCorpusReader._get_chunked_words.<locals>.<listcomp>r   O)r    -IBZBO   r   )ru   r   r$   r   r   r}   rj   labelrp   poprq   )r&   r=   r*   r-   r   r   Z
chunk_tagsstackwordpos_tagZ	chunk_tagstateZ
chunk_typeZ	new_chunkr   r>   r   rG     s,   


z$ConllCorpusReader._get_chunked_wordsc              
      s    | jd }  | jd }r# jkr# fdd|D }  | jd }d}t|||D ]:\}}	}
|dkr?d}|d	krEd
}|	dkrKd}	|	d	krQd
}	|
d\}}|d	d	 }|| d|	 d| d| 7 }q4z j|}W n tt	fy    jd j
 d| d	}Y nw |s| D ]'}t|D ] \}}t|trt|dkrt|d tr|d | f||< qq|S )Nr   r	   c                    rv   r   rw   rx   r>   r   r   rh   ,  rz   z6ConllCorpusReader._get_parsed_sent.<locals>.<listcomp>r
   r   (z-LRB-)z-RRB-r   z ( z)    r   )ru   r   r$   r}   rj   countr!   Z
fromstringr   
IndexErrorr   Zsubtreesr   r   r   rp   r   r   )r&   r=   r+   r-   r   r   Z
parse_tagsZtreestrr   r   	parse_tagleftrightr
   Zsubtreer   childr   r>   r   rQ   (  sF     z"ConllCorpusReader._get_parsed_sentc                 C   s  | j r| || jd d }| jd d }n| || jd }| jd d }tdd |D }g }t|D ]O}| ||| }g }g }	t|D ]7\}
}|d\}}|dD ]}|rb|	||
f qWt|dD ]}|		 \}}|||
d f|f qjqG|| q5|S )	z;
        list of list of (start, end), tag) tuples
        r   r   r   c                 S   s   g | ]}|d kr|qS )r   r   )r   pr   r   r   rh   Z  rl   z4ConllCorpusReader._get_srl_spans.<locals>.<listcomp>r   r   r   )
r    ru   r   rp   ranger   rj   rq   r   r   )r&   r=   
predicatesZ	start_colZ	num_preds	spanlistsr   colspanlistr   wordnumZsrl_tagr   r   tagstartr   r   r   rW   M  s.   z ConllCorpusReader._get_srl_spansc              	   C   s   |  ||}| |}| jr#| || jd d }| || jd }n| || jd }d gt| }t|}t|D ]9\}}	|	dkrDq;|D ]}
|
D ]\\}}}|t||v r]|dv r] nqJqF nt	d|	 |
t|||	|| |
 q;|S )Nr   r   r   VzC-VzNo srl column found for %r)rQ   rW   r    ru   r   rp   ConllSRLInstanceListr   r   r   rq   ConllSRLInstance)r&   r=   r+   r
   r   r   Zrolesets	instancesr   	predicater   r   endr   r   r   r   rY   m  s.   
z$ConllCorpusReader._get_srl_instancesc                 G   s$   |D ]}|| j vrtd| qd S )Nz)This corpus does not contain a %s column.)r   r   )r&   r)   r/   r   r   r   r1     s   
zConllCorpusReader._requirec                    s    fddt tD S )Nc                    s   g | ]}|   qS r   r   r   r   column_indexr=   r   r   rh     rl   z1ConllCorpusReader._get_column.<locals>.<listcomp>)r   rp   )r=   r   r   r   r   ru     s   zConllCorpusReader._get_columnr0   )NN)NNN)NNT)%__name__
__module____qualname____doc__r2   rA   rT   rK   ZNErV   ZIGNOREr   r   r#   r   r7   rC   rD   rM   rO   rU   rX   r^   rc   rd   r4   re   r3   r;   r`   rG   rQ   rW   rY   r1   staticmethodru   r   r   r   r   r      sT    

!

















"% $r   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	r   z|
    An SRL instance from a CoNLL corpus, which identifies and
    providing labels for the arguments of a single verb.
    c           	      C   s   g | _ 	 || _	 || _|| _g | _	 || _	 || _	 | | _	 |D ]!\\}}}|dv r9|  j t	t
||7  _ q"| j||f|f q"d S )Nr   )verb	verb_head	verb_stemroleset	argumentstagged_spansr
   leavesr   r|   r   rq   )	r&   r
   r   r   r   r   r   r   r   r   r   r   r#     s&   
zConllSRLInstance.__init__c                 C   s,   t | jdkr	dnd}d| jt | j|f S )Nr   sr   z,<ConllSRLInstance for %r with %d argument%s>)rp   r   r   )r&   pluralr   r   r   __repr__  s   zConllSRLInstance.__repr__c           	         s   d  fdd jD }d|d jd}d}t jD ]8\}}t|tr+|d } jD ]\\}}}||kr>|d	| 7 }||krF|d
7 }q.| jv rPd| }||d 7 }q|tj	|
ddddd S )Nr   c                 3   s    | ]
} j | d  V  qdS )r   N)r   r   rg   r   r   	<genexpr>  s    z*ConllSRLInstance.pprint.<locals>.<genexpr>zSRL for z (stem=z):
r   r   z[%s z] z<<%s>>z ]]z    )initial_indentsubsequent_indent)joinr   r   r   r   r   tupler   textwrapfillreplace)	r&   Zverbstrhdrr   r   r   r   r   argidr   rg   r   pprint  s$   

zConllSRLInstance.pprintN)r   r   r   r   r#   r   r   r   r   r   r   r     s
    *r   c                   @   s4   e Zd ZdZdddZdd Zddd	Zd
d ZdS )r   z0
    Set of instances for a single sentence
    r   c                 C   s   || _ t| | d S r0   )r
   r|   r#   )r&   r
   r   r   r   r   r#     s   zConllSRLInstanceList.__init__c                 C   s   |   S r0   )r   rg   r   r   r   __str__  s   zConllSRLInstanceList.__str__Fc                 C   sF  | D ]}|j | j krtdq|r.| j  }d gt| }dgt| }| | j d||| d}tt|D ]j}|rW|d||  7 }|d||  7 }|dt|| d 7 }| D ]}||jkri|d|j	 7 } nqY|dd 7 }| D ])}d}|j
D ]\\}	}
}||	krd	| | }||
d
 kr|d7 }qy|d| 7 }qr|d7 }q6|S )NzTree mismatch!r   r   r   z%-20s z%-8s z
%15s*%-8s r   r   r   r   z%-12s rm   )r
   r   r   rp   _tree2conllr   r   rj   r   r   r   )r&   Zinclude_treeinstr   r	   syntr   r   Zargstrr   r   r   r   r   r   r     s@   


zConllSRLInstanceList.pprintc                 C   s   t |tsJ t|dkr(t |d tr(| ||< || |d ks$J |d S t|dkrMt |d trMt|d dks?J |d \||< ||< |d S d|  ||  ||< |D ]}| |||||}q\||d   d7  < |S )Nr   r   r   r   r   )r   r   rp   r   r   r   r   )r&   r
   r   r   r	   r   r   r   r   r   r     s   z ConllSRLInstanceList._tree2conllN)r   )F)r   r   r   r   r#   r   r   r   r   r   r   r   r     s    

(r   c                   @   s   e Zd ZdZ	dddZdS )ConllChunkCorpusReaderz`
    A ConllCorpusReader whose data file contains three columns: words,
    pos, and chunk.
    r   Nc              
   C   s   t j| ||d||||d d S )N)r   r	   r   )r*   r,   r-   r.   )r   r#   )r&   r'   r(   r*   r,   r-   r.   r   r   r   r#   7  s   
zConllChunkCorpusReader.__init__)r   NN)r   r   r   r   r#   r   r   r   r   r   1  s    r   )r   r   Znltk.corpus.reader.apiZnltk.corpus.reader.utilZnltk.tagr   Z	nltk.treer   Z	nltk.utilr   r   r"   r   r   r|   r   r   r   r   r   r   <module>   s      
NF