o
    rZh6                     @   s   d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 dd Z
G dd	 d	e	ZG d
d dZG dd dZG dd deZdS )z9
A reader for corpora whose documents are in MTE format.
    N)reduce)TaggedCorpusReaderconcat)XMLCorpusViewc                 C   s   |  ||S N)findall)rootpathns r   E/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/mte.pyxpath   s   r   c                   @   s$   e Zd ZdZdddZdddZdS )	MTECorpusViewz0
    Class for lazy viewing the MTE Corpus.
    Nc                 C   s   t | ||| d S r   )r   __init__)selfZfileidtagspecelt_handlerr   r   r   r      s   zMTECorpusView.__init__c              	   C   s   t tdd t| |||S )Nc                 S      | d uS r   r   xr   r   r   <lambda>       z*MTECorpusView.read_block.<locals>.<lambda>)listfilterr   
read_block)r   streamr   r   r   r   r   r      s   zMTECorpusView.read_blockr   )NN)__name__
__module____qualname____doc__r   r   r   r   r   r   r      s    
r   c                   @   s   e Zd ZdZdddZdZdZdZdZd	Z	d
d Z
edd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zedd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0S )1MTEFileReaderz
    Class for loading the content of the multext-east corpus. It
    parses the xml files and does some tag-filtering depending on the
    given method parameters.
    zhttps://www.tei-c.org/ns/1.0z%https://www.w3.org/XML/1998/namespace)Zteixmlz{https://www.tei-c.org/ns/1.0}z'{https://www.w3.org/XML/1998/namespace}zTEI/text/body/div/div/p/s/(w|c)zTEI/text/body/div/div/p/szTEI/text/body/div/div/pc                 C   s
   || _ d S r   )_MTEFileReader__file_path)r   	file_pathr   r   r   r   3   s   
zMTEFileReader.__init__c                 C   s   |j S r   )textclseltcontextr   r   r   	_word_elt6   s   zMTEFileReader._word_eltc                        fddt |d jD S )Nc                       g | ]}  |d qS r   )r)   .0wr&   r   r   
<listcomp><       z+MTEFileReader._sent_elt.<locals>.<listcomp>*r   r
   r%   r   r/   r   	_sent_elt:      zMTEFileReader._sent_eltc                    r*   )Nc                    r+   r   )r4   r-   sr/   r   r   r0   @   r1   z+MTEFileReader._para_elt.<locals>.<listcomp>r2   r3   r%   r   r/   r   	_para_elt>   r5   zMTEFileReader._para_eltc                 C   s   d|j vr
|jdfS | jdkr| jdkr|j|j d fS | jdkr1| jdkr1|jt|j d fS tdtdd| j d }|	|j d r`| jdkrU|j|j d fS |jt|j d fS d S )	NZana msd	universal^-.z.*$)
attribr$   _MTEFileReader__tags_MTEFileReader__tagsetMTETagConvertermsd_to_universalrecompilesubmatch)r&   r'   r(   tagsr   r   r   _tagged_word_eltB   s   


zMTEFileReader._tagged_word_eltc                    *   t tdd  fddt|d jD S )Nc                 S   r   r   r   r   r   r   r   r   \   r   z0MTEFileReader._tagged_sent_elt.<locals>.<lambda>c                    r+   r   )rI   r,   r/   r   r   r0   ]   r1   z2MTEFileReader._tagged_sent_elt.<locals>.<listcomp>r2   r   r   r   r
   r%   r   r/   r   _tagged_sent_eltX      zMTEFileReader._tagged_sent_eltc                    rJ   )Nc                 S   r   r   r   r   r   r   r   r   e   r   z0MTEFileReader._tagged_para_elt.<locals>.<lambda>c                    r+   r   )rL   r6   r/   r   r   r0   f   r1   z2MTEFileReader._tagged_para_elt.<locals>.<listcomp>r2   rK   r%   r   r/   r   _tagged_para_elta   rM   zMTEFileReader._tagged_para_eltc                 C   s$   d|j vr
|jdfS |j|j d fS )NZlemmar9   )r?   r$   r%   r   r   r   _lemma_word_eltj   s   

zMTEFileReader._lemma_word_eltc                    r*   )Nc                    r+   r   )rO   r,   r/   r   r   r0   s   r1   z1MTEFileReader._lemma_sent_elt.<locals>.<listcomp>r2   r3   r%   r   r/   r   _lemma_sent_eltq   r5   zMTEFileReader._lemma_sent_eltc                    r*   )Nc                    r+   r   )rP   r6   r/   r   r   r0   w   r1   z1MTEFileReader._lemma_para_elt.<locals>.<listcomp>r2   r3   r%   r   r/   r   _lemma_para_eltu   r5   zMTEFileReader._lemma_para_eltc                 C      t | jtjtjS r   )r   r"   r    	word_pathr)   r   r   r   r   wordsy      zMTEFileReader.wordsc                 C   rR   r   )r   r"   r    	sent_pathr4   rT   r   r   r   sents~   rV   zMTEFileReader.sentsc                 C   rR   r   )r   r"   r    	para_pathr8   rT   r   r   r   paras   rV   zMTEFileReader.parasc                 C   rR   r   )r   r"   r    rS   rO   rT   r   r   r   lemma_words   rV   zMTEFileReader.lemma_wordsc                 C      |t _|t _t| jt jt jS r   )r    rA   r@   r   r"   rS   rI   r   tagsetrH   r   r   r   tagged_words   
   zMTEFileReader.tagged_wordsc                 C   rR   r   )r   r"   r    rW   rP   rT   r   r   r   lemma_sents   rV   zMTEFileReader.lemma_sentsc                 C   r\   r   )r    rA   r@   r   r"   rW   rL   r]   r   r   r   tagged_sents   r`   zMTEFileReader.tagged_sentsc                 C   rR   r   )r   r"   r    rY   rQ   rT   r   r   r   lemma_paras   rV   zMTEFileReader.lemma_parasc                 C   r\   r   )r    rA   r@   r   r"   rY   rN   r]   r   r   r   tagged_paras   r`   zMTEFileReader.tagged_parasN)r   r   r   r   r
   Ztag_nsZxml_nsrS   rW   rY   r   classmethodr)   r4   r8   rI   rL   rN   rO   rP   rQ   rU   rX   rZ   r[   r_   ra   rb   rc   rd   r   r   r   r   r    "   sL    








r    c                   @   s:   e Zd ZdZdddddddd	d
ddddZedd ZdS )rB   zu
    Class for converting msd tags to universal tags, more conversion
    options are currently not implemented.
    ZADJZADPZADVZCONJZDETZNOUNZNUMZPRTZPRONZVERBr>   X)ASRCDNMQPVr>   r=   c                 C   s4   | d dks
| d n| d }|t jvrd}t j| S )z
        This function converts the annotation from the Multex-East to the universal tagset
        as described in Chapter 5 of the NLTK-Book

        Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so
        r   #   r=   )rB   mapping_msd_universal)tagZ	indicatorr   r   r   rC      s   

z MTETagConverter.msd_to_universalN)r   r   r   r   rs   staticmethodrC   r   r   r   r   rB      s"    rB   c                   @   s|   e Zd ZdZdddZdd Zddd	Zdd
dZdddZdddZ	dddZ
dddZdddZdddZdddZdS )MTECorpusReaderz
    Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East.
    MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging
    scheme. These tags can be converted to the Universal tagset
    Nutf8c                 C   s   t | ||| d| _dS )a.  
        Construct a new MTECorpusreader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP

        :param root: The root directory for this corpus. (default points to location in multext config file)
        :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml)
        :param encoding: The encoding of the given files (default is utf8)
        z00README.txtN)r   r   Z_readme)r   r   fileidsencodingr   r   r   r      s   
zMTECorpusReader.__init__c                    sP   |d u r j }nt|tr|g}t fdd|}tdd |}|s&td |S )Nc                    s
   |  j v S r   )_fileidsr   rT   r   r   r      s   
 z+MTECorpusReader.__fileids.<locals>.<lambda>c                 S   s   | dvS )N)zoana-bg.xmlzoana-mk.xmlr   r   r   r   r   r      r   z$No valid multext-east file specified)rz   
isinstancestrr   printr   rx   r   rT   r   Z	__fileids   s   
zMTECorpusReader.__fileidsc                       t  fdd |D S )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        c                    $   g | ]}t tj j| qS r   )r    osr	   join_rootrU   r-   frT   r   r   r0          z)MTECorpusReader.words.<locals>.<listcomp>r   _MTECorpusReader__fileidsr~   r   rT   r   rU      s
   
zMTECorpusReader.wordsc                    r   )z
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances,
                 each encoded as a list of word strings
        :rtype: list(list(str))
        c                    r   r   )r    r   r	   r   r   rX   r   rT   r   r   r0   
  r   z)MTECorpusReader.sents.<locals>.<listcomp>r   r~   r   rT   r   rX     
   
zMTECorpusReader.sentsc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a list
                 of sentences, which are in turn encoded as lists of word string
        :rtype: list(list(list(str)))
        c                    r   r   )r    r   r	   r   r   rZ   r   rT   r   r   r0     r   z)MTECorpusReader.paras.<locals>.<listcomp>r   r~   r   rT   r   rZ     r   zMTECorpusReader.parasc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of words, the corresponding lemmas
                 and punctuation symbols, encoded as tuples (word, lemma)
        :rtype: list(tuple(str,str))
        c                    r   r   )r    r   r	   r   r   r[   r   rT   r   r   r0   &  r   z/MTECorpusReader.lemma_words.<locals>.<listcomp>r   r~   r   rT   r   r[     r   zMTECorpusReader.lemma_wordsr:   r9   c                    <   dksdkrt  fdd |D S td dS )a;  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of tagged words and punctuation symbols
                 encoded as tuples (word, tag)
        :rtype: list(tuple(str, str))
        r;   r:   c                    (   g | ]}t tj j|qS r   )r    r   r	   r   r   r_   r   r   rH   r^   r   r   r0   9      z0MTECorpusReader.tagged_words.<locals>.<listcomp>Unknown tagset specified.Nr   r   r}   r   rx   r^   rH   r   r   r   r_   ,     	zMTECorpusReader.tagged_wordsc                    r   )aB  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of sentences or utterances, each
                 encoded as a list of tuples of the word and the corresponding
                 lemma (word, lemma)
        :rtype: list(list(tuple(str, str)))
        c                    r   r   )r    r   r	   r   r   ra   r   rT   r   r   r0   L  r   z/MTECorpusReader.lemma_sents.<locals>.<listcomp>r   r~   r   rT   r   ra   C  
   
zMTECorpusReader.lemma_sentsc                    r   )aH  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of sentences or utterances, each
                 each encoded as a list of (word,tag) tuples
        :rtype: list(list(tuple(str, str)))
        r;   r:   c                    r   r   )r    r   r	   r   r   rb   r   r   r   r   r0   _  r   z0MTECorpusReader.tagged_sents.<locals>.<listcomp>r   Nr   r   r   r   r   rb   R  r   zMTECorpusReader.tagged_sentsc                    r   )am  
        :param fileids: A list specifying the fileids that should be used.
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list of
                 tuples of the word and the corresponding lemma (word, lemma)
        :rtype: list(List(List(tuple(str, str))))
        c                    r   r   )r    r   r	   r   r   rc   r   rT   r   r   r0   r  r   z/MTECorpusReader.lemma_paras.<locals>.<listcomp>r   r~   r   rT   r   rc   i  r   zMTECorpusReader.lemma_parasc                    r   )a  
        :param fileids: A list specifying the fileids that should be used.
        :param tagset: The tagset that should be used in the returned object,
                       either "universal" or "msd", "msd" is the default
        :param tags: An MSD Tag that is used to filter all parts of the used corpus
                     that are not more precise or at least equal to the given tag
        :return: the given file(s) as a list of paragraphs, each encoded as a
                 list of sentences, which are in turn encoded as a list
                 of (word,tag) tuples
        :rtype: list(list(list(tuple(str, str))))
        r;   r:   c                    r   r   )r    r   r	   r   r   rd   r   r   r   r   r0     r   z0MTECorpusReader.tagged_paras.<locals>.<listcomp>r   Nr   r   r   r   r   rd   x  s   	zMTECorpusReader.tagged_paras)NNrw   r   )Nr:   r9   )r   r   r   r   r   r   rU   rX   rZ   r[   r_   ra   rb   rc   rd   r   r   r   r   rv      s    








rv   )r   r   rD   	functoolsr   Znltk.corpus.readerr   r   Znltk.corpus.reader.xmldocsr   r   r   r    rB   rv   r   r   r   r   <module>   s     %