o
    Zh                     @   sZ   d Z ddlmZmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZ G dd deZdgZd	S )
z
Processor class for MarkupLM.
    )OptionalUnion   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategyc                    @   s   e Zd ZdZdZdZdZ																			dded	eee	e
f d
eee	ef dee dedee dee dee dedededededeee	ef  defddZdd Zdd Zedd ZdS ) MarkupLMProcessoraJ  
    Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
    processor.

    [`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

    It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
    Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
    `input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

    Args:
        feature_extractor (`MarkupLMFeatureExtractor`):
            An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
        tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
            An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
        parse_html (`bool`, *optional*, defaults to `True`):
            Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
    ZMarkupLMFeatureExtractor)ZMarkupLMTokenizerZMarkupLMTokenizerFastTNFr   add_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                 K   s  | j r)|du rtd|dus|dus|durtd| |}|d }|d }n|dur1td|du s9|du r=td|durL| j rLt|trL|g}| jdi d|durW|n|d	|dur`|n8dd|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|S d|d
|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d||}|S )a  
        This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
        passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
        returns the output.

        Optionally, one can also provide a `text` argument which is passed along as first sequence.

        Please refer to the docstring of the above two methods for more information.
        NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`textZ	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r    )
parse_html
ValueErrorZfeature_extractor
isinstancestr	tokenizer)selfZhtml_stringsr   r   r   Z	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsfeaturesZencoded_inputsr   r   _/var/www/auris/lib/python3.10/site-packages/transformers/models/markuplm/processing_markuplm.py__call__2   s   !



	
	
zMarkupLMProcessor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
        to the docstring of this method for more information.
        )r#   batch_decoder$   argsr%   r   r   r'   r*         zMarkupLMProcessor.batch_decodec                 O   r)   )z
        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
        docstring of this method for more information.
        )r#   decoder+   r   r   r'   r.      r-   zMarkupLMProcessor.decodec                 C   s   | j j}|S )N)r#   model_input_names)r$   Ztokenizer_input_namesr   r   r'   r/      s   z#MarkupLMProcessor.model_input_names)NNNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____doc__Zfeature_extractor_classZtokenizer_classr   boolr   r"   r   r	   r   intr   r   r(   r*   r.   propertyr/   r   r   r   r'   r
      sv    	

Pr
   N)r3   typingr   r   Z
file_utilsr   Zprocessing_utilsr   Ztokenization_utils_baser   r   r	   r
   __all__r   r   r   r'   <module>   s   
|