o
    rZh$                     @   sb   d Z ddlmZ ddlmZmZmZ G dd deZdddZG d	d
 d
e	Z
G dd deZdS )zACorpus reader for the XML version of the British National Corpus.    )concat)ElementTreeXMLCorpusReaderXMLCorpusViewc                   @   sT   e Zd ZdZdddZdddZdd	d
ZdddZdddZdddZ	dd Z
dS )BNCCorpusReadera7  Corpus reader for the XML version of the British National Corpus.

    For access to the complete XML data structure, use the ``xml()``
    method.  For access to simple word lists and tagged word lists, use
    ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``.

    You can obtain the full version of the BNC corpus at
    https://www.ota.ox.ac.uk/desc/2554

    If you extracted the archive to a directory called `BNC`, then you can
    instantiate the reader as::

        BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml')

    Tc                 C   s   t | || || _d S N)r   __init___lazy)selfrootfileidsZlazy r   E/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/bnc.pyr      s   
zBNCCorpusReader.__init__NFc                 C      |  |dd||S )aT  
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        FN_viewsr
   r   strip_spacestemr   r   r   words#   s   
zBNCCorpusReader.wordsc                 C   s   |rdnd}|  |d|||S )a   
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        c5posFr   r
   r   r   r   r   tagr   r   r   tagged_words/   s   zBNCCorpusReader.tagged_wordsc                 C   r   )a  
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))

        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        TNr   r   r   r   r   sents?   s   zBNCCorpusReader.sentsc                 C   s    |rdnd}| j |d|||dS )a  
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param c5: If true, then the tags used will be the more detailed
            c5 tags.  Otherwise, the simplified tags will be used.
        :param strip_space: If true, then strip trailing spaces from
            word tokens.  Otherwise, leave the spaces on the tokens.
        :param stem: If true, then use word stems instead of word strings.
        r   r   T)sentr   r   r   r   r   r   r   r   tagged_sentsL   s   
zBNCCorpusReader.tagged_sentsc                    s4   | j rtn| j t fdd| |D S )zPA helper function that instantiates BNCWordViews or the list of words/sentences.c                    s   g | ]
} |qS r   r   ).0fileidfr   r   r   r   r   r   
<listcomp>a   s    z*BNCCorpusReader._views.<locals>.<listcomp>)r	   BNCWordView_wordsr   Zabspaths)r
   r   r   r   r   r   r   r    r   r   ]   s   zBNCCorpusReader._viewsc              	   C   s   g }t | }|dD ]U}g }	t|D ]9}
|
j}|sd}|s#|r'| }|r/|
d|}|dkr;||
df}n|dkrJ||
d|
df}|	| q|r^|t	|j
d |	 q||	 qd|vsjJ |S )a  
        Helper used to implement the view methods -- returns a list of
        words or a list of sentences, optionally tagged.

        :param fileid: The name of the underlying file.
        :param bracket_sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.//s hwr   r   nN)r   parseZgetrootfindall_all_xmlwords_intextstripgetappendBNCSentenceattribextend)r
   r   Zbracket_sentr   r   r   resultZxmldocZxmlsentr   Zxmlwordwordr   r   r   r$   g   s,   zBNCCorpusReader._words)T)NTF)NFTF)NFFTF)__name__
__module____qualname____doc__r   r   r   r   r   r   r$   r   r   r   r   r      s    






r   Nc                 C   s:   |d u rg }| D ]}|j dv r|| qt|| q|S )N)cw)r   r.   r*   )eltr2   childr   r   r   r*      s   
r*   c                   @   s   e Zd ZdZdd ZdS )r/   z
    A list of words, augmented by an attribute ``num`` used to record
    the sentence identifier (the ``n`` attribute from the XML).
    c                 C   s   || _ t| | d S r   )numlistr   )r
   r<   itemsr   r   r   r      s   zBNCSentence.__init__N)r4   r5   r6   r7   r   r   r   r   r   r/      s    r/   c                   @   sB   e Zd ZdZh dZ	 dd Zdd Zdd Zd	d
 Zdd Z	dS )r#   zN
    A stream backed corpus view specialized for use with the BNC corpus.
    >   aligneventshiftgapZunclearZvocalpauseZpbc                 C   s|   |rd}nd}|| _ || _|| _|| _d| _d| _d| _d| _t	| || | 
  | | jd| j |   ddi| _dS )aG  
        :param fileid: The name of the underlying file.
        :param sent: If true, include sentence bracketing.
        :param tag: The name of the tagset to use, or None for no tags.
        :param strip_space: If true, strip spaces from word tokens.
        :param stem: If true, then substitute stems for words.
        z.*/sz.*/s/(.*/)?(c|w)Nz.*/teiHeader$r   r   )_sent_tag_strip_space_stemtitleauthoreditorrespsr   r   _openZ
read_block_streamhandle_headercloseZ_tag_context)r
   r   r   r   r   r   Ztagspecr   r   r   r      s    zBNCWordView.__init__c                 C   s   | d}|rddd |D | _| d}|r$ddd |D | _| d}|r6ddd |D | _| d	}|rJd
dd |D | _d S d S )NztitleStmt/title
c                 s       | ]}|j  V  qd S r   r+   r,   )r   rH   r   r   r   	<genexpr>       z,BNCWordView.handle_header.<locals>.<genexpr>ztitleStmt/authorc                 s   rQ   r   rR   )r   rI   r   r   r   rS      rT   ztitleStmt/editorc                 s   rQ   r   rR   )r   rJ   r   r   r   rS      rT   ztitleStmt/respStmtz

c                 s   s$    | ]}d  dd |D V  qdS )rP   c                 s   rQ   r   rR   )r   Zresp_eltr   r   r   rS      rT   z6BNCWordView.handle_header.<locals>.<genexpr>.<genexpr>N)join)r   respr   r   r   rS      s    
)r)   rU   rH   rI   rJ   rK   )r
   r:   contexttitlesZauthorsZeditorsrK   r   r   r   rN      s   




zBNCWordView.handle_headerc                 C   s   | j r| |S | |S r   )rD   handle_senthandle_word)r
   r:   rW   r   r   r   
handle_elt   s   

zBNCWordView.handle_eltc                 C   st   |j }|sd}| js| jr| }| jr|d|}| jdkr(||df}|S | jdkr8||d|df}|S )Nr%   r&   r   r   )r+   rF   rG   r,   r-   rE   )r
   r:   r3   r   r   r   rZ      s   

zBNCWordView.handle_wordc                    sv   g }|D ].}|j dv r| fdd|D 7 }q|j dv r%| | q|j  jvr2td|j  qt|jd |S )N)mwhiZcorrtruncc                    s   g | ]}  |qS r   )rZ   )r   r9   r
   r   r   r"     s    z+BNCWordView.handle_sent.<locals>.<listcomp>)r9   r8   zUnexpected element %sr'   )r   r.   rZ   tags_to_ignore
ValueErrorr/   r0   )r
   r:   r   r;   r   r_   r   rY      s   

zBNCWordView.handle_sentN)
r4   r5   r6   r7   r`   r   rN   r[   rZ   rY   r   r   r   r   r#      s    
 r#   r   )r7   Znltk.corpus.reader.utilr   Znltk.corpus.reader.xmldocsr   r   r   r   r*   r=   r/   r#   r   r   r   r   <module>   s   
