o
    rZŽh>  ã                   @   st   d Z ddlZddlmZ ddlmZ ddlT ddlmZ ddl	m
Z
 ddlmZ G d	d
„ d
eƒZG dd„ deƒZdS )z‚
Corpus reader for corpora whose documents are xml files.

(note -- not named 'xml' to avoid conflicting w/ standard xml package)
é    N)ÚElementTree)ÚCorpusReader)Ú*)ÚSeekableUnicodeStreamReader)ÚElementWrapper)ÚWordPunctTokenizerc                   @   s.   e Zd ZdZd
dd„Zddd„Zddd	„ZdS )ÚXMLCorpusReadera  
    Corpus reader for corpora whose documents are xml files.

    Note that the ``XMLCorpusReader`` constructor does not take an
    ``encoding`` argument, because the unicode encoding is specified by
    the XML files themselves.  See the XML specs for more info.
    Fc                 C   s   || _ t | ||¡ d S ©N)Ú_wrap_etreer   Ú__init__)ÚselfÚrootZfileidsZ
wrap_etree© r   úI/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/xmldocs.pyr   !   s   zXMLCorpusReader.__init__Nc                 C   s€   |d u rt | jƒdkr| jd }t|tƒstdƒ‚|  |¡ ¡ }t |¡ 	¡ }W d   ƒ n1 s2w   Y  | j
r>t|ƒ}|S )Né   r   z(Expected a single file identifier string)ÚlenZ_fileidsÚ
isinstanceÚstrÚ	TypeErrorÚabspathÚopenr   ÚparseZgetrootr
   r   )r   ÚfileidÚfpÚeltr   r   r   Úxml%   s   

ÿzXMLCorpusReader.xmlc           
      C   s€   |   |¡}|  |¡}tƒ }z| ¡ }W n   | ¡ }Y g }|D ]}|j}|dur=t|tƒr3| |¡}| 	|¡}	| 
|	¡ q |S )aE  
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- ie, tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        N)r   Úencodingr   ZgetiteratorÚiterÚtextr   ÚbytesÚdecodeÚtokenizeÚextend)
r   r   r   r   Zword_tokenizerÚiteratorÚoutÚnoder   Útoksr   r   r   Úwords4   s"   







€zXMLCorpusReader.words)Fr	   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r'   r   r   r   r   r      s
    

r   c                   @   sv   e Zd ZdZdZdZddd„Zdd„ Zd	d
„ Ze	 
de	je	jB ¡Ze	 
d¡Ze	 
de	je	jB ¡Zdd„ Zddd„ZdS )ÚXMLCorpusViewam  
    A corpus view that selects out specified elements from an XML
    file, and provides a flat list-like interface for accessing them.
    (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself,
    but may be used by subclasses of ``XMLCorpusReader``.)

    Every XML corpus view has a "tag specification", indicating what
    XML elements should be included in the view; and each (non-nested)
    element that matches this specification corresponds to one item in
    the view.  Tag specifications are regular expressions over tag
    paths, where a tag path is a list of element tag names, separated
    by '/', indicating the ancestry of the element.  Some examples:

      - ``'foo'``: A top-level element whose tag is ``foo``.
      - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent
        is a top-level element whose tag is ``foo``.
      - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere
        in the xml tree.
      - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``,
        appearing anywhere in the xml tree.

    The view items are generated from the selected XML elements via
    the method ``handle_elt()``.  By default, this method returns the
    element as-is (i.e., as an ElementTree object); but it can be
    overridden, either via subclassing or via the ``elt_handler``
    constructor parameter.
    Fi   Nc                 C   sF   |r|| _ t |d ¡| _	 ddi| _	 |  |¡}tj| ||d dS )aW  
        Create a new corpus view based on a specified XML file.

        Note that the ``XMLCorpusView`` constructor does not take an
        ``encoding`` argument, because the unicode encoding is
        specified by the XML files themselves.

        :type tagspec: str
        :param tagspec: A tag specification, indicating what XML
            elements should be included in the view.  Each non-nested
            element that matches this specification corresponds to one
            item in the view.

        :param elt_handler: A function used to transform each element
            to a value for the view.  If no handler is specified, then
            ``self.handle_elt()`` is called, which returns the element
            as an ElementTree object.  The signature of elt_handler is::

                elt_handler(elt, tagspec) -> value
        z\Zr   r   )r   N)Ú
handle_eltÚreÚcompileÚ_tagspecÚ_tag_contextÚ_detect_encodingÚStreamBackedCorpusViewr   )r   r   ÚtagspecÚelt_handlerr   r   r   r   r   u   s   

zXMLCorpusView.__init__c                 C   sô   t |tƒrz| ¡ }| ¡ }W | ¡  n| ¡  w t|dƒ}| ¡ }W d   ƒ n1 s-w   Y  | tj¡r:dS | tj¡rBdS | tj	¡rJdS | tj
¡rRdS | tj¡rZdS t d|¡}|ri| d¡ ¡ S t d	|¡}|rx| d¡ ¡ S dS )
NÚrbz	utf-16-bez	utf-16-lez	utf-32-bez	utf-32-lezutf-8s!   \s*<\?xml\b.*\bencoding="([^"]+)"r   s!   \s*<\?xml\b.*\bencoding='([^']+)')r   ZPathPointerr   ÚreadlineÚcloseÚ
startswithÚcodecsÚBOM_UTF16_BEÚBOM_UTF16_LEÚBOM_UTF32_BEÚBOM_UTF32_LEÚBOM_UTF8r.   ÚmatchÚgroupr    )r   r   ÚinfileÚsÚmr   r   r   r2   ™   s2   


ÿzXMLCorpusView._detect_encodingc                 C   s   |S )a  
        Convert an element into an appropriate value for inclusion in
        the view.  Unless overridden by a subclass or by the
        ``elt_handler`` constructor argument, this method simply
        returns ``elt``.

        :return: The view value corresponding to ``elt``.

        :type elt: ElementTree
        :param elt: The element that should be converted.

        :type context: str
        :param context: A string composed of element tags separated by
            forward slashes, indicating the XML context of the given
            element.  For example, the string ``'foo/bar/baz'``
            indicates that the element is a ``baz`` element whose
            parent is a ``bar`` element and whose grandparent is a
            top-level ``foo`` element.
        r   )r   r   Úcontextr   r   r   r-   ¶   s   zXMLCorpusView.handle_elta;  
        [^<]*
        (
          ((<!--.*?-->)                         |  # comment
           (<![CDATA[.*?]])                     |  # raw character data
           (<!DOCTYPE\s+[^\[]*(\[[^\]]*])?\s*>) |  # doctype decl
           (<[^!>][^>]*>))                         # tag or PI
          [^<]*)*
        \Zz<\s*(?:/\s*)?([^\s>]+)a6  
        # Include these so we can skip them:
        (?P<COMMENT>        <!--.*?-->                          )|
        (?P<CDATA>          <![CDATA[.*?]]>                     )|
        (?P<PI>             <\?.*?\?>                           )|
        (?P<DOCTYPE>        <!DOCTYPE\s+[^\[^>]*(\[[^\]]*])?\s*>)|
        # These are the ones we actually care about:
        (?P<EMPTY_ELT_TAG>  <\s*[^>/\?!\s][^>]*/\s*>            )|
        (?P<START_TAG>      <\s*[^>/\?!\s][^>]*>                )|
        (?P<END_TAG>        <\s*/[^>/\?!\s][^>]*>               )c                 C   sô   d}t |tƒr| ¡ }	 | | j¡}||7 }| j |¡r|S t d|¡ 	d¡dkr?| ¡ t
|ƒt d|¡ ¡   }td| ƒ‚|sEtdƒ‚| d¡}|dkry| j |d	|… ¡ryt |tƒrh| |¡ | |¡ n| t
|ƒ|  d
¡ |d	|… S q)a{  
        Read a string from the given stream that does not contain any
        un-closed tags.  In particular, this function first reads a
        block from the stream of size ``self._BLOCK_SIZE``.  It then
        checks if that block contains an un-closed tag.  If it does,
        then this function either backtracks to the last '<', or reads
        another block.
        Ú Tz[<>]r   ú>zUnexpected ">" near char %sz&Unexpected end of file: tag not closedú<Nr   )r   r   ÚtellÚreadÚ_BLOCK_SIZEÚ_VALID_XML_REr@   r.   ÚsearchrA   r   ÚendÚ
ValueErrorÚrfindÚseekÚchar_seek_forward)r   ÚstreamÚfragmentÚstartposZ	xml_blockÚposZlast_open_bracketr   r   r   Ú_read_xml_fragmentñ   s0   	
ÿ


áz XMLCorpusView._read_xml_fragmentc                    s  |du r| j }ˆ du r| j‰ t| j | ¡ ¡ƒ}|dusJ ‚g }d}d}d}|g ks/|durht|tƒr8| ¡ }	|  |¡}
|
sI|du rEn#t	dƒ‚| j
 |
¡D ]É}| jretd d |¡dd… | ¡ ¡ƒ | d¡r| j | ¡ ¡ d¡}| |¡ |du rt |d |¡¡r| ¡ }t|ƒ}qO| d	¡rä| j | ¡ ¡ d¡}|s¨t	d
| ƒ‚||d kr»t	d|d › d|› dƒ‚|durß|t|ƒkrß||
|| ¡ … 7 }| |d |¡f¡ d }}d}| ¡  qO| d¡r| j | ¡ ¡ d¡}|du rt |d |¡d | ¡r| | ¡ d |¡d | f¡ qO|dur`|g kr.||
|d… 7 }d}n2| jr6tdƒ t|tƒrG| |	¡ | |¡ n| t|
ƒ|  d¡ |d|d … }d }}d}|g ks/|dus/| ¡ }|| jv rt|ƒ| j| ks~J ‚nt|ƒ| j|< ‡ fdd„|D ƒS )z¼
        Read from ``stream`` until we find at least one element that
        matches ``tagspec``, and return the result of applying
        ``elt_handler`` to each element found.
        NrF   zUnexpected end of filez	{:>25} {}ú/iìÿÿÿZ	START_TAGr   ZEND_TAGzUnmatched tag </%s>éÿÿÿÿzUnmatched tag <z>...</rG   ZEMPTY_ELT_TAGr   z/                                    (backtrack)c              	      s(   g | ]\}}ˆ t  | d d¡¡|ƒ‘qS )ÚasciiÚxmlcharrefreplace)r   Z
fromstringÚencode)Ú.0r   rE   ©r5   r   r   Ú
<listcomp>‡  s    üþÿz,XMLCorpusView.read_block.<locals>.<listcomp>)r0   r-   Úlistr1   ÚgetrI   r   r   rW   rO   Ú
_XML_PIECEÚfinditerÚ_DEBUGÚprintÚformatÚjoinrA   Ú_XML_TAG_NAMEr@   Úappendr.   Ústartr   rN   ÚpoprQ   rR   Útuple)r   rS   r4   r5   rE   ÚeltsZ	elt_startZ	elt_depthZelt_textrU   Zxml_fragmentZpieceÚnamerV   r   r^   r   Ú
read_block"  sˆ   

"

€


 €


¹J
ûzXMLCorpusView.read_blockr	   )NN)r(   r)   r*   r+   rd   rK   r   r2   r-   r.   r/   ÚDOTALLÚVERBOSErL   rh   rb   rW   ro   r   r   r   r   r,   Q   s$    
$
	ö


õ1r,   )r+   r:   Z	xml.etreer   Znltk.corpus.reader.apir   Znltk.corpus.reader.utilZ	nltk.datar   Znltk.internalsr   Znltk.tokenizer   r   r3   r,   r   r   r   r   Ú<module>   s   9