o
    rZh                     @   sd   d dl mZ d dlmZmZmZ d dlmZmZ d dl	m
Z
mZ G dd deZG dd deZd	S )
    )CorpusReader)StreamBackedCorpusViewconcatread_alignedsent_block)RegexpTokenizerWhitespaceTokenizer)AlignedSent	Alignmentc                   @   sL   e Zd ZdZde edddedfddZdd
dZdddZ	dddZ
d	S )AlignedCorpusReaderz
    Reader for corpora of word-aligned sentences.  Tokens are assumed
    to be separated by whitespace.  Sentences begin on separate lines.
    /
T)Zgapslatin1c                 C   s,   t | ||| || _|| _|| _|| _dS )a  
        Construct a new Aligned Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        N)r   __init__Z_sep_word_tokenizer_sent_tokenizer_alignedsent_block_reader)selfrootfileidssepword_tokenizersent_tokenizeralignedsent_block_readerencoding r   I/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/aligned.pyr      s
   
zAlignedCorpusReader.__init__Nc                       t  fdd |dD S )z~
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        c              
      *   g | ]\}}t ||d d  j j jqS )FAlignedSentCorpusViewr   r   r   .0Zfileidencr   r   r   
<listcomp>9       
z-AlignedCorpusReader.words.<locals>.<listcomp>Tr   Zabspathsr   r   r   r#   r   words2   s
   


zAlignedCorpusReader.wordsc                    r   )z
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        c              
      s*   g | ]\}}t ||d d j j jqS )FTr   r    r#   r   r   r$   O   r%   z-AlignedCorpusReader.sents.<locals>.<listcomp>Tr&   r'   r   r#   r   sentsG   s
   


zAlignedCorpusReader.sentsc                    r   )zp
        :return: the given file(s) as a list of AlignedSent objects.
        :rtype: list(AlignedSent)
        c              
      r   )Tr   r    r#   r   r   r$   c   r%   z5AlignedCorpusReader.aligned_sents.<locals>.<listcomp>Tr&   r'   r   r#   r   aligned_sents]   s
   


z!AlignedCorpusReader.aligned_sents)N)__name__
__module____qualname____doc__r   r   r   r   r(   r)   r*   r   r   r   r   r
      s    	



r
   c                   @   s    e Zd ZdZdd Zdd ZdS )r   z
    A specialized corpus view for aligned sentences.
    ``AlignedSentCorpusView`` objects are typically created by
    ``AlignedCorpusReader`` (not directly by nltk users).
    c                 C   s2   || _ || _|| _|| _|| _tj| ||d d S )N)r   )_aligned_group_by_sentr   r   r   r   r   )r   Zcorpus_filer   ZalignedZgroup_by_sentr   r   r   r   r   r   r   y   s   
zAlignedSentCorpusView.__init__c                    sd    fdd  |D } jr"td|d |d< t| g}|S  jr,|d g}|S |d }|S )Nc                    s*   g | ]} j |D ]} j|q
qS r   )r   tokenizer   )r!   Zalignedsent_strZsent_strr#   r   r   r$      s    

z4AlignedSentCorpusView.read_block.<locals>.<listcomp>    r   )r   r/   r	   Z
fromstringjoinr   r0   )r   streamblockr   r#   r   
read_block   s   


z AlignedSentCorpusView.read_blockN)r+   r,   r-   r.   r   r7   r   r   r   r   r   r   s    r   N)Znltk.corpus.reader.apir   Znltk.corpus.reader.utilr   r   r   Znltk.tokenizer   r   Znltk.translater   r	   r
   r   r   r   r   r   <module>   s   `