o
    rZh=                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ dd Z	G dd deZ
G dd	 d	eZG d
d dZG dd deZG dd deZG dd deZdS )    Nconcat)XMLCorpusReaderXMLCorpusViewc                    s   t  d fdd	}|S )zj
    Wraps function arguments:
    if fileids not specified then function set NKJPCorpusReader paths.
    Nc                    s   |s| j } | |fi |S N)_pathsselffileidskwargsfun F/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/nkjp.py	decorator   s   z_parse_args.<locals>.decoratorr   )	functoolswraps)r   r   r   r   r   _parse_args   s   r   c                   @   s   e Zd ZdZdZdZdZdddZdd	 Zd
d Z	dddZ
dd ZedddZedddZedddZedddZedddZdS )NKJPCorpusReaderr            .*c                 C   sD   t |trt| ||d  nt| |dd |D  |  | _dS )aN  
        Corpus reader designed to work with National Corpus of Polish.
        See http://nkjp.pl/ for more details about NKJP.
        use example:
        import nltk
        import nkjp
        from nkjp import NKJPCorpusReader
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus
        x.header()
        x.raw()
        x.words()
        x.tagged_words(tags=['subst', 'comp'])  #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html
        x.sents()
        x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s)
        x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'])
        x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp'])
        z.*/header.xmlc                 S   s   g | ]}|d  qS )z/header.xmlr   .0fileidr   r   r   
<listcomp><   s    z-NKJPCorpusReader.__init__.<locals>.<listcomp>N)
isinstancestrr   __init__	get_pathsr   )r	   rootr
   r   r   r   r   &   s   
zNKJPCorpusReader.__init__c                    s    fdd j D S )Nc                    s*   g | ]}t jt j|d d qS 
header.xmlr   )ospathjoinr   _rootsplitr   fr	   r   r   r   A   s    z.NKJPCorpusReader.get_paths.<locals>.<listcomp>Z_fileidsr+   r   r+   r   r    @   s   
zNKJPCorpusReader.get_pathsc                 C   s   dd | j D S )zf
        Returns a list of file identifiers for the fileids that make up
        this corpus.
        c                 S   s   g | ]	}| d d qS r"   r(   r)   r   r   r   r   K   s    z,NKJPCorpusReader.fileids.<locals>.<listcomp>r,   r+   r   r   r   r
   F   s   zNKJPCorpusReader.fileidsNc                 K   sr   | dtj}|tju rt||dS |tju rt||dS |tju r(t||dS |tju r5t	||t	jdS t
d)zQ
        Returns a view specialised for use with particular corpus file.
        mode)tags)r/   r.   zNo such mode!)popr   
WORDS_MODENKJPCorpus_Morph_View
SENTS_MODENKJPCorpus_Segmentation_ViewHEADER_MODENKJPCorpus_Header_ViewRAW_MODENKJPCorpus_Text_View	NameError)r	   filenamer/   r   r.   r   r   r   _viewM   s   



zNKJPCorpusReader._viewc                 C   s   | j |v r|S | j | S )z<
        Add root if necessary to specified fileid.
        )r!   )r	   r   r   r   r   add_root`   s   

zNKJPCorpusReader.add_rootc                       t  fdd|D S )z9
        Returns header(s) of specified fileids.
        c                    .   g | ]}j |fd tji  qS r.   )r;   r<   r   r5   handle_queryr   r   r	   r   r   r   n       z+NKJPCorpusReader.header.<locals>.<listcomp>r   r   r   rA   r   headerh   
   zNKJPCorpusReader.headerc                    r=   )z9
        Returns sentences in specified fileids.
        c                    r>   r?   )r;   r<   r   r3   r@   r   rA   r   r   r   |   rB   z*NKJPCorpusReader.sents.<locals>.<listcomp>r   r   r   rA   r   sentsv   rD   zNKJPCorpusReader.sentsc                    r=   )5
        Returns words in specified fileids.
        c                    r>   r?   r;   r<   r   r1   r@   r   rA   r   r   r      rB   z*NKJPCorpusReader.words.<locals>.<listcomp>r   r   r   rA   r   words   s
   zNKJPCorpusReader.wordsc                    s&     dg t fdd|D S )z
        Call with specified tags as a list, e.g. tags=['subst', 'comp'].
        Returns tagged words in specified fileids.
        r/   c                    s0   g | ]}j |ftjd   qS ))r.   r/   rG   r   r   r	   r/   r   r   r      s    z1NKJPCorpusReader.tagged_words.<locals>.<listcomp>)r0   r   r   r   rI   r   tagged_words   s   zNKJPCorpusReader.tagged_wordsc                    r=   )rF   c                    r>   r?   )r;   r<   r   r7   r@   r   rA   r   r   r      rB   z(NKJPCorpusReader.raw.<locals>.<listcomp>r   r   r   rA   r   raw   rD   zNKJPCorpusReader.raw)r   r   )__name__
__module____qualname__r1   r3   r5   r7   r   r    r
   r;   r<   r   rC   rE   rH   rJ   rK   r   r   r   r   r       s(    

r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )r6   c                 K   s   d| _ t| |d | j  dS )z
        HEADER_MODE
        A stream backed corpus view specialized for use with
        header.xml files in NKJP corpus.
        z.*/sourceDesc$r#   N)tagspecr   r   r	   r:   r   r   r   r   r      s   zNKJPCorpus_Header_View.__init__c                 C   sB   |    g }	 t| | j}t|dkrn|| q|   |S NTr   )_openr   
read_block_streamlenextendclose)r	   rC   segmr   r   r   r@      s   
z#NKJPCorpus_Header_View.handle_queryc                 C   s   | d}g }|rddd |D }| d}g }|r&ddd |D }| d}g }|r9ddd |D }| d	}	g }
|	rLdd
d |	D }
| d}g }|r_ddd |D }| d}g }|rrddd |D }||||
||dS )Nz
bibl/title
c                 s       | ]}|j  V  qd S r   textstrip)r   titler   r   r   	<genexpr>       z4NKJPCorpus_Header_View.handle_elt.<locals>.<genexpr>zbibl/authorc                 s   rZ   r   r[   )r   authorr   r   r   r_      r`   z	bibl/datec                 s   rZ   r   r[   )r   dater   r   r   r_      r`   zbibl/publisherc                 s   rZ   r   r[   )r   	publisherr   r   r   r_      r`   z	bibl/idnoc                 s   rZ   r   r[   )r   idnor   r   r   r_      r`   z	bibl/notec                 s   rZ   r   r[   )r   noter   r   r   r_      r`   )r^   ra   rb   rc   rd   re   )findallr&   )r	   eltcontexttitlesr^   Zauthorsra   datesrb   Z
publishersrc   Zidnosrd   notesre   r   r   r   
handle_elt   s>   





z!NKJPCorpus_Header_View.handle_eltN)rL   rM   rN   r   r@   rl   r   r   r   r   r6      s    	r6   c                   @   (   e Zd ZdZdd Zdd Zdd ZdS )	XML_Toola  
    Helper class creating xml file to one without references to nkjp: namespace.
    That's needed because the XMLCorpusView assumes that one can find short substrings
    of XML that are valid XML, which is not true if a namespace is declared at top level
    c                 C   s"   t j||| _tjdd| _d S )NF)delete)r$   r%   r&   	read_filetempfileNamedTemporaryFile
write_file)r	   r!   r:   r   r   r   r      s   zXML_Tool.__init__c              
   C   s   z_t | j}| j}d}t|rS| }td|}d|}td|}d|}td|}d|}td|}d|}td|}d|}|| t|s|	  |	  | jj
W S  tyr } z|   t|d }~ww )N znkjp:[^ ]* z<nkjp:paren>z</nkjp:paren>z<choice>z	</choice>)openrp   rs   rU   readlinerer(   r&   writerW   name	Exceptionremove_preprocessed_file)r	   frfwlinexreter   r   r   build_preprocessed_file   s4   







z XML_Tool.build_preprocessed_filec                 C   s   t | jj d S r   )r$   removers   ry   r+   r   r   r   r{     s   z!XML_Tool.remove_preprocessed_fileN)rL   rM   rN   __doc__r   r   r{   r   r   r   r   rn      s
    rn   c                   @   sP   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd ZdS )r4   zm
    A stream backed corpus view specialized for use with
    ann_segmentation.xml files in NKJP corpus.
    c                 K   sF   d| _ t|tjd| _| j  t|d| _t| | j	 | j  d S )Nz.*p/.*sr?   zann_segmentation.xml)
rO   r8   r3   	text_viewr@   rn   xml_toolr   r   r   rP   r   r   r   r   !  s   
z%NKJPCorpus_Segmentation_View.__init__c                 C   s   | dd  dd S )N(r   ,r   r-   )r	   Zexample_wordr   r   r   get_segm_id/  s   z(NKJPCorpus_Segmentation_View.get_segm_idc                 C   s   t |dd S )Nr   r   )intr(   )r	   Zbeg_wordr   r   r   get_sent_beg2  s   z)NKJPCorpus_Segmentation_View.get_sent_begc                 C   s,   | dd  d}t|d t|d  S )N)r   r   r   r   )r(   r   )r	   Zend_wordZsplittedr   r   r   get_sent_end6  s   z)NKJPCorpus_Segmentation_View.get_sent_endc                 C   sJ   |  |d }| jj| }| |d }| |t|d  }||| S )Nr   r   )r   r   	segm_dictr   r   rU   )r	   	sent_segmidrX   begendr   r   r   get_sentences;  s
   z*NKJPCorpus_Segmentation_View.get_sentencesc                 C   sV   g }d}d}|D ] }|  |}| ||d ks||kr&|| | |}|}q|S )Nr   )r   r   appendr   )r	   rX   r   Zprev_txt_endZprev_txt_nrwordZtxt_nrr   r   r   remove_choiceC  s   


z*NKJPCorpus_Segmentation_View.remove_choicec              
   C   s   z4|    g }	 t| | j}t|dkrn|D ]}| |}|| | qq|   | j	
  |W S  tyH } z| j	
  t|d }~ww rQ   )rR   r   rS   rT   rU   r   r   r   rW   r   r{   rz   )r	   Z	sentencesr   rX   r   r   r   r   r@   Q  s&   


z)NKJPCorpus_Segmentation_View.handle_queryc                 C   s"   g }|D ]
}| |d q|S )NZcorresp)r   get)r	   rg   rh   r   segr   r   r   rl   c  s   z'NKJPCorpus_Segmentation_View.handle_eltN)rL   rM   rN   r   r   r   r   r   r   r   r@   rl   r   r   r   r   r4     s    r4   c                   @   sB   e Zd ZdZdZdZdd Zdd Zdd	d
Zdd Z	dd Z
dS )r8   za
    A stream backed corpus view specialized for use with
    text.xml files in NKJP corpus.
    r   r   c                 K   sB   | dd| _d| _t | _t|d| _t| | j	 | j d S )Nr.   r   z	.*/div/abztext.xml)
r0   r.   rO   dictr   rn   r   r   r   r   rP   r   r   r   r   s  s   zNKJPCorpus_Text_View.__init__c              
   C   sV   z|    | | j}|   | j  |W S  ty* } z| j  t|d }~ww r   )rR   rS   rT   rW   r   r{   rz   )r	   r   r   r   r   r   r@   ~  s   

z!NKJPCorpus_Text_View.handle_queryNc                 C   sL   g }	 t | |}t|dkrn|D ]}|| qqddd |D gS )z6
        Returns text as a list of sentences.
        Tr   rt   c                 S   s   g | ]}|qS r   r   )r   rX   r   r   r   r     s    z3NKJPCorpus_Text_View.read_block.<locals>.<listcomp>)r   rS   rU   r   r&   )r	   streamrO   Zelt_handlertxtrX   partr   r   r   rS     s   zNKJPCorpus_Text_View.read_blockc                 C   s(   |j D ]}|dr||  S qd S )Nr   )attribendswithr   )r	   rg   attrr   r   r   r     s
   

z NKJPCorpus_Text_View.get_segm_idc                 C   s$   | j tju r|j| j| |< |jS r   )r.   r8   r3   r\   r   r   )r	   rg   rh   r   r   r   rl     s   zNKJPCorpus_Text_View.handle_elt)NN)rL   rM   rN   r   r3   r7   r   r@   rS   r   rl   r   r   r   r   r8   j  s    
r8   c                   @   rm   )	r2   zm
    A stream backed corpus view specialized for use with
    ann_morphosyntax.xml files in NKJP corpus.
    c                 K   s:   | dd | _d| _t|d| _t| | j | j d S )Nr/   z	.*/seg/fszann_morphosyntax.xml)r0   r/   rO   rn   r   r   r   r   rP   r   r   r   r     s   zNKJPCorpus_Morph_View.__init__c              
   C   s   z0|    g }	 t| | j}t|dkrn|D ]}|d ur#|| qq|   | j  |W S  t	yD } z| j  t	|d }~ww rQ   )
rR   r   rS   rT   rU   r   rW   r   r{   rz   )r	   rH   rX   r   r   r   r   r   r@     s(   


z"NKJPCorpus_Morph_View.handle_queryc           
      C   s$  d}d}d}| j d u rd}|D ]x}d| v r,|jd dkr,|D ]
}|jdkr*|j}q qd| v r|jd dkr|D ]K}d| v r|jd d	kr|D ]9}d| v r|jd d
kr|D ]'}	d|	 v ru| j d uru|	jd | j v rud}q]d|	 v r|	jd dkrd}q]qLq;q|r|r|S d S d S )N FTry   ZorthstringZinterpstypelexZctagvalueZinterp)r/   keysr   tagr\   )
r	   rg   rh   r   flagZis_not_interpchildsymbolZsymbol2Zsymbol3r   r   r   rl     s<   


z NKJPCorpus_Morph_View.handle_eltN)rL   rM   rN   r   r   r@   rl   r   r   r   r   r2     s
    r2   )r   r$   rw   rq   Znltk.corpus.reader.utilr   Znltk.corpus.reader.xmldocsr   r   r   r   r6   rn   r4   r8   r2   r   r   r   r   <module>   s    >(O9