o
    rZh                     @   sv   d dl Z d dlmZmZ d dlmZmZmZ d dlm	Z	 dd Z
G dd deZd	d
 Zdd Zedkr9e  dS dS )    N)CorpusReaderSyntaxCorpusReader)FileSystemPathPointerfind_corpus_fileidsread_blankline_block)DependencyGraphc                 C   s   d dd | D S )N/c                 s   s$    | ]}|d  dkr|d  V  qdS )r   EOSN .0mr
   r
   F/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/knbc.py	<genexpr>   s   " z<lambda>.<locals>.<genexpr>joinZmorphsr
   r
   r   <lambda>   s    r   c                   @   s@   e Zd ZdZdefddZdd Zdd Zdd
dZdd Z	d	S )KNBCorpusReadera  
    This class implements:
      - ``__init__``, which specifies the location of the corpus
        and a method for detecting the sentence blocks in corpus files.
      - ``_read_block``, which reads a block from the input stream.
      - ``_word``, which takes a block and returns a list of list of words.
      - ``_tag``, which takes a block and returns a list of list of tagged
        words.
      - ``_parse``, which takes a block and returns a list of parsed
        sentences.

    The structure of tagged words:
      tagged_word = (word(str), tags(tuple))
      tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...)

    Usage example

    >>> from nltk.corpus.util import LazyCorpusLoader
    >>> knbc = LazyCorpusLoader(
    ...     'knbc/corpus1',
    ...     KNBCorpusReader,
    ...     r'.*/KN.*',
    ...     encoding='euc-jp',
    ... )

    >>> len(knbc.sents()[0])
    9

    utf8c                 C   s   t | ||| || _dS )z
        Initialize KNBCorpusReader
        morphs2str is a function to convert morphlist to str for tree representation
        for _parse()
        N)r   __init__
morphs2str)selfrootfileidsencodingr   r
   r
   r   r   7   s   
zKNBCorpusReader.__init__c                 C   s   t |S N)r   )r   streamr
   r
   r   _read_block@   s   zKNBCorpusReader._read_blockc                 C   s>   g }|  D ]}td|s| d}||d  q|S )NEOS|\*|\#|\+ r   )
splitlinesrematchstripsplitappend)r   treslinecellsr
   r
   r   _wordD   s   zKNBCorpusReader._wordNc              	   C   sP   g }|  D ]}td|s%| d}||d d|dd  f q|S )Nr   r    r      )r!   r"   r#   r$   r%   r&   r   )r   r'   Ztagsetr(   r)   r*   r
   r
   r   _tagO   s    zKNBCorpusReader._tagc           
      C   s*  t  }d}| D ]s}|d dv rV| dd}td|d }|d us'J |j| }|||dg d t	|d}|d	krG||_
n
|j| d
 | |d7 }q	|d dkr|| d}|d d|dd  f}	|j|d  d |	 q	| jr|j D ]}| |d |d< q| S )Nr   z*+r       z([\-0-9]*)([ADIP])r,      )addressrelworddeps#r2   )r   r!   r$   r%   r"   r#   nodesupdategroupintr   r&   r   r   valuestree)
r   r'   dgir)   r*   r   nodeZ
dep_parentZmorphr
   r
   r   _parseZ   s.   

zKNBCorpusReader._parser   )
__name__
__module____qualname____doc___morphs2str_defaultr   r   r+   r-   r?   r
   r
   r
   r   r      s    	
r   c                  C   s   dd l } ddlm} | jd}dd tt|dD }dd }|d	tt||d
dd}t	|
 d d  t	d| d d  t	ddd | d d D  dd |_t	ddd | d d D  t	ddd | dd D  d S )Nr   LazyCorpusLoaderzcorpora/knbc/corpus1c                 S   s   g | ]
}t d |r|qS )z\d\-\d\-[\d]+\-[\d]+)r"   search)r   fr
   r
   r   
<listcomp>   s    
zdemo.<locals>.<listcomp>z.*c                 S   s2   |  d}|d t|d t|d t|d fS )N-r   r,   r/   r.   )r%   r9   )xr*   r
   r
   r   _knbc_fileids_sort   s   
(z demo.<locals>._knbc_fileids_sortknbc/corpus1)keyeuc-jpr   
    d   z

c                 s   s    | ]}t |V  qd S r   )strr   r;   r
   r
   r   r          zdemo.<locals>.<genexpr>r/   c                 S   s   d dd | D dS )Nr   c                 s   s:    | ]}|d  dkrd |d  |d dd V  qdS )r   r	   z{}({})r,   r    r/   Nformatr%   r   r
   r
   r   r      s    0z)demo.<locals>.<lambda>.<locals>.<genexpr>zutf-8)r   encoder   r
   r
   r   r      s   
 zdemo.<locals>.<lambda>c                 s   s    | ]}d | V  qdS )z%sNr
   rU   r
   r
   r   r      rV   
c                 s   s$    | ]}d  dd |D V  qdS )r    c                 s   s.    | ]}d  |d |d dd V  qdS )z{}/{}r   r,   r    r/   NrW   )r   wr
   r
   r   r      s   , z!demo.<locals>.<genexpr>.<genexpr>Nr   )r   sentr
   r
   r   r      s
    
)nltknltk.corpus.utilrF   datafindr   r   r   sortedprintr   r   wordsZparsed_sentsr   tagged_sents)r]   rF   r   r   rL   knbcr
   r
   r   demo   s.   
$
$
rf   c                  C   s   ddl m}  | dtddd}t| d tsJ t| d d ts&J t| d ts1J t|	 d d ts>J d S )Nr   rE   rM   z.*/KN.*rO   rP   )
r^   rF   r   
isinstancerc   rT   ZsentsZtagged_wordstuplerd   )rF   re   r
   r
   r   test   s   ri   __main__)r"   Znltk.corpus.reader.apir   r   Znltk.corpus.reader.utilr   r   r   Z
nltk.parser   rD   r   rf   ri   r@   r
   r
   r
   r   <module>   s   	l)
