o
    rZh#                     @   s|   d Z ddlZddlZddlZddlmZ ddlT ddlm	Z	 ddl
T ddlT ddlmZ G dd deZG d	d
 d
eZdS )zN
A reader for corpora that contain chunked (and optionally tagged)
documents.
    Ntagstr2tree)*)BracketParseCorpusReader)Treec                   @   s   e Zd ZdZdeedddeddfdd	Zdd
dZdddZ	dddZ
dddZdddZdddZdddZdddZdddZdd ZdS ) ChunkedCorpusReadera&  
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.
     
T)Zgapsutf8Nc	           	      C   s"   t | ||| ||||f| _dS )z
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)CorpusReader__init___cv_args)	selfrootfileids	extensionstr2chunktreesent_tokenizerpara_block_readerencodingtagset r   I/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/chunked.pyr   &   s   zChunkedCorpusReader.__init__c                       t  fdd |dD S )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        c              	      s,   g | ]\}}t ||d d d d g jR  qS )r   ChunkedCorpusViewr   .0fencr   r   r   
<listcomp>A       z-ChunkedCorpusReader.words.<locals>.<listcomp>TconcatZabspathsr   r   r   r    r   words:   s
   

zChunkedCorpusReader.wordsc                    r   )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        c              	      s,   g | ]\}}t ||d dd d g jR  qS r      r   r   r    r   r   r!   O   r"   z-ChunkedCorpusReader.sents.<locals>.<listcomp>Tr#   r%   r   r    r   sentsG   
   

zChunkedCorpusReader.sentsc                    r   )z
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        c              	      s,   g | ]\}}t ||d ddd g jR  qS r'   r   r   r    r   r   r!   ]   r"   z-ChunkedCorpusReader.paras.<locals>.<listcomp>Tr#   r%   r   r    r   parasU   r*   zChunkedCorpusReader.parasc                        t  fdd |dD S )z
        :return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            ``(word,tag)``.
        :rtype: list(tuple(str,str))
        c              	      s2   g | ]\}}t ||d dddg jR diqS r(   r   target_tagsetr   r   r   r   r   r   r!   k       z4ChunkedCorpusReader.tagged_words.<locals>.<listcomp>Tr#   r   r   r   r   r/   r   tagged_wordsc   
   
z ChunkedCorpusReader.tagged_wordsc                    r,   )z
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.

        :rtype: list(list(tuple(str,str)))
        c              	      s2   g | ]\}}t ||d d ddg jR diqS r-   r   r   r/   r   r   r!   {   r0   z4ChunkedCorpusReader.tagged_sents.<locals>.<listcomp>Tr#   r1   r   r/   r   tagged_sentss   r3   z ChunkedCorpusReader.tagged_sentsc                    r,   )z
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of ``(word,tag)`` tuples.
        :rtype: list(list(list(tuple(str,str))))
        c              	      s2   g | ]\}}t ||d d d dg jR diqS r-   r   r   r/   r   r   r!      r0   z4ChunkedCorpusReader.tagged_paras.<locals>.<listcomp>Tr#   r1   r   r/   r   tagged_paras   r3   z ChunkedCorpusReader.tagged_parasc                    r,   )av  
        :return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as ``(word, tag)``
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over ``(word,tag)`` tuples or word strings.
        :rtype: list(tuple(str,str) and Tree)
        c              	      s2   g | ]\}}t ||d ddd g jR diqS r-   r   r   r/   r   r   r!      r0   z5ChunkedCorpusReader.chunked_words.<locals>.<listcomp>Tr#   r1   r   r/   r   chunked_words   
   	
z!ChunkedCorpusReader.chunked_wordsc                    r,   )a6  
        :return: the given file(s) as a list of
            sentences, each encoded as a shallow Tree.  The leaves
            of these trees are encoded as ``(word, tag)`` tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        :rtype: list(Tree)
        c              	      s2   g | ]\}}t ||d d dd g jR diqS r-   r   r   r/   r   r   r!      r0   z5ChunkedCorpusReader.chunked_sents.<locals>.<listcomp>Tr#   r1   r   r/   r   chunked_sents   r7   z!ChunkedCorpusReader.chunked_sentsc                    r,   )ao  
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow Tree.  The leaves of these
            trees are encoded as ``(word, tag)`` tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        :rtype: list(list(Tree))
        c              	      s2   g | ]\}}t ||d d d d g jR diqS )r(   r.   r   r   r/   r   r   r!      r0   z5ChunkedCorpusReader.chunked_paras.<locals>.<listcomp>Tr#   r1   r   r/   r   chunked_paras   r7   z!ChunkedCorpusReader.chunked_parasc                 C   s   dd t |D S )Nc                 S   s   g | ]}t |qS r   r   )r   tr   r   r   r!      s    z3ChunkedCorpusReader._read_block.<locals>.<listcomp>)read_blankline_block)r   streamr   r   r   _read_block   s   zChunkedCorpusReader._read_block)NNN)__name__
__module____qualname____doc__r   ZRegexpTokenizerr;   r   r&   r)   r+   r2   r4   r5   r6   r8   r9   r=   r   r   r   r   r      s&    










r   c                   @   s*   e Zd Z		dddZdd Zdd ZdS )	r   Nc                 C   sJ   t j| ||d || _|| _|| _|| _|| _|| _|	| _|
| _	|| _
d S )N)r   )StreamBackedCorpusViewr   _tagged_group_by_sent_group_by_para_chunked_str2chunktree_sent_tokenizer_para_block_reader_source_tagset_target_tagset)r   Zfileidr   ZtaggedZgroup_by_sentZgroup_by_parachunkedr   r   r   source_tagsetr.   r   r   r   r      s   
zChunkedCorpusView.__init__c                 C   s   g }|  |D ]B}g }| j|D ])}| j|| j| jd}| js%| |}| js,|	 }| j
r5|| q|| q| jrD|| q|| q|S )N)rN   r.   )rJ   rI   tokenizerH   rK   rL   rD   _untagrG   leavesrE   appendextendrF   )r   r<   blockZpara_strparaZsent_strsentr   r   r   
read_block   s(   
zChunkedCorpusView.read_blockc                 C   sJ   t |D ]\}}t|tr| | qt|tr|d ||< qtd|S )Nr   z"expected child to be Tree or tuple)	enumerate
isinstancer   rP   tuple
ValueError)r   treeichildr   r   r   rP   	  s   

zChunkedCorpusView._untagr>   )r?   r@   rA   r   rW   rP   r   r   r   r   r      s    
"r   )rB   codecsZos.pathosZnltkZ
nltk.chunkr   Znltk.corpus.reader.apiZ nltk.corpus.reader.bracket_parser   Znltk.corpus.reader.utilZnltk.tokenizeZ	nltk.treer   r   r   rC   r   r   r   r   r   <module>   s    4