o
    Zh1                     @   sn   d Z ddlZddlmZmZ ddlmZmZmZ e r%ddl	Z	ddl	m
Z
 eeZG dd deZdgZdS )	z'
Feature extractor class for MarkupLM.
    N   )BatchFeatureFeatureExtractionMixin)is_bs4_availableloggingrequires_backends)BeautifulSoupc                       sF   e Zd ZdZ fddZdd Zdd Zdd	 Zd
efddZ	  Z
S )MarkupLMFeatureExtractorao  
    Constructs a MarkupLM feature extractor. This can be used to get a list of nodes and corresponding xpaths from HTML
    strings.

    This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`] which contains most
    of the main methods. Users should refer to this superclass for more information regarding those methods.

    c                    s"   t | dg t jdi | d S )Nbs4 )r   super__init__)selfkwargs	__class__r   g/var/www/auris/lib/python3.10/site-packages/transformers/models/markuplm/feature_extraction_markuplm.pyr   +   s   z!MarkupLMFeatureExtractor.__init__c                    s   g }g }|j r	|n|j  jD ]+}|j j dd}| j  |dt|kr)dnt fddt|dD  | q|  |  ||fS )NF)	recursive   r   c                 3   s     | ]\}}| u r|V  qd S )Nr   ).0ischildr   r   	<genexpr>7   s    z6MarkupLMFeatureExtractor.xpath_soup.<locals>.<genexpr>)	nameparentparentsZfind_allappendlennext	enumeratereverse)r   element
xpath_tagsxpath_subscriptsr   Zsiblingsr   r   r   
xpath_soup/   s   
*z#MarkupLMFeatureExtractor.xpath_soupc           
      C   s   t |d}g }g }g }|jD ]3}t|tjjrAt|jtjjur!qt	
| }|s+q|| | |\}}	|| ||	 qt|t|krNtdt|t|krZtd|||fS )Nzhtml.parserz3Number of doc strings and xtags does not correspondz3Number of doc strings and xsubs does not correspond)r   Zdescendants
isinstancer
   r#   ZNavigableStringtyper   Taghtmlunescapestripr   r&   r   
ValueError)
r   html_stringZ	html_codeall_doc_stringsstring2xtag_seqstring2xsubs_seqr#   Ztext_in_this_tagr$   r%   r   r   r   get_three_from_single>   s*   





z.MarkupLMFeatureExtractor.get_three_from_singlec                 C   sB   d}t ||D ]\}}|d| 7 }|dkr|d| d7 }q|S )N /r   [])zip)r   r$   r%   Zxpathtagnamesubsr   r   r   construct_xpath[   s   z(MarkupLMFeatureExtractor.construct_xpathreturnc                 C   s  d}t |tr
d}nt |ttfr t|dkst |d tr d}|s,tdt| dtt |ttfo:t |d t}|sA|g}g }g }|D ]-}| |\}}}	|	| g }
t
|||	D ]\}}}| ||}|
	| q^|	|
 qG||d}t|dd}|S )	a\  
        Main method to prepare for the model one or several HTML strings.

        Args:
            html_strings (`str`, `List[str]`):
                The HTML string or batch of HTML strings from which to extract nodes and corresponding xpaths.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **nodes** -- Nodes.
            - **xpaths** -- Corresponding xpaths.

        Examples:

        ```python
        >>> from transformers import MarkupLMFeatureExtractor

        >>> page_name_1 = "page1.html"
        >>> page_name_2 = "page2.html"
        >>> page_name_3 = "page3.html"

        >>> with open(page_name_1) as f:
        ...     single_html_string = f.read()

        >>> feature_extractor = MarkupLMFeatureExtractor()

        >>> # single example
        >>> encoding = feature_extractor(single_html_string)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])

        >>> # batched example

        >>> multi_html_strings = []

        >>> with open(page_name_2) as f:
        ...     multi_html_strings.append(f.read())
        >>> with open(page_name_3) as f:
        ...     multi_html_strings.append(f.read())

        >>> encoding = feature_extractor(multi_html_strings)
        >>> print(encoding.keys())
        >>> # dict_keys(['nodes', 'xpaths'])
        ```FTr   zQHTML strings must of type `str`, `List[str]` (batch of examples), but is of type .)nodesxpathsN)dataZtensor_type)r'   strlisttupler   r-   r(   boolr2   r   r7   r:   r   )r   Zhtml_stringsZvalid_stringsZ
is_batchedr=   r>   r.   r/   r0   r1   Zxpath_stringsnodeZtag_listZsub_listZxpath_stringr?   Zencoded_inputsr   r   r   __call__c   s8   0
 

z!MarkupLMFeatureExtractor.__call__)__name__
__module____qualname____doc__r   r&   r2   r:   r   rE   __classcell__r   r   r   r   r	   !   s    	r	   )rI   r*   Zfeature_extraction_utilsr   r   utilsr   r   r   r
   r   Z
get_loggerrF   loggerr	   __all__r   r   r   r   <module>   s   
 
