
    fTh                     `    S r SSKJrJr  SSKJr  SSKJr  SSKJ	r	J
r
Jr   " S S\5      rS/rg	)
z
Processor class for MarkupLM.
    )OptionalUnion   )
TensorType)ProcessorMixin)BatchEncodingPaddingStrategyTruncationStrategyc                        \ rS rSrSrSrSrSr                   SS\S\	\\
\4   S	\	\\
\4   S
\\   S\S\\   S\\   S\\   S\S\S\S\S\S\\	\
\4      S\4S jjrS rS r\S 5       rSrg)MarkupLMProcessor   a  
Constructs a MarkupLM processor which combines a MarkupLM feature extractor and a MarkupLM tokenizer into a single
processor.

[`MarkupLMProcessor`] offers all the functionalities you need to prepare data for the model.

It first uses [`MarkupLMFeatureExtractor`] to extract nodes and corresponding xpaths from one or more HTML strings.
Next, these are provided to [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`], which turns them into token-level
`input_ids`, `attention_mask`, `token_type_ids`, `xpath_tags_seq` and `xpath_subs_seq`.

Args:
    feature_extractor (`MarkupLMFeatureExtractor`):
        An instance of [`MarkupLMFeatureExtractor`]. The feature extractor is a required input.
    tokenizer (`MarkupLMTokenizer` or `MarkupLMTokenizerFast`):
        An instance of [`MarkupLMTokenizer`] or [`MarkupLMTokenizerFast`]. The tokenizer is a required input.
    parse_html (`bool`, *optional*, defaults to `True`):
        Whether or not to use `MarkupLMFeatureExtractor` to parse HTML strings into nodes and corresponding xpaths.
MarkupLMFeatureExtractor)MarkupLMTokenizerMarkupLMTokenizerFastTNadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    U R                   (       a>  Uc  [        S5      eUc  Uc  Ub  [        S5      eU R                  U5      nUS   nUS   nOUb  [        S5      eUb  Uc  [        S5      eUb)  U R                   (       a  [        U[        5      (       a  U/nU R
                  " S0 SUb  UOU_S	Ub  UOS_SU_S
U_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nU$ )a  
This method first forwards the `html_strings` argument to [`~MarkupLMFeatureExtractor.__call__`]. Next, it
passes the `nodes` and `xpaths` along with the additional arguments to [`~MarkupLMTokenizer.__call__`] and
returns the output.

Optionally, one can also provide a `text` argument which is passed along as first sequence.

Please refer to the docstring of the above two methods for more information.
NzDMake sure to pass HTML strings in case `parse_html` is set to `True`zUPlease don't pass nodes, xpaths nor node labels in case `parse_html` is set to `True`nodesxpathsz@You have passed HTML strings but `parse_html` is set to `False`.zIMake sure to pass nodes and xpaths in case `parse_html` is set to `False`text	text_pairnode_labelsr   r   r   r   r   r   r   r   r   r   r   r   r   r    )
parse_html
ValueErrorfeature_extractor
isinstancestr	tokenizer)selfhtml_stringsr!   r"   r%   	questionsr   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargsfeaturesencoded_inputss                          h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/markuplm/processing_markuplm.py__call__MarkupLMProcessor.__call__2   s   B ??# !ghh F$6+:Q k  --l;HW%Eh'F' !cdd} !lmm  T__)S))&K	 
'3
(4e$
 
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
,     c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please refer
to the docstring of this method for more information.
)r,   batch_decoder-   argsr0   s      r3   r8   MarkupLMProcessor.batch_decode   s    
 ~~**D;F;;r6   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
docstring of this method for more information.
)r,   decoder9   s      r3   r=   MarkupLMProcessor.decode   s    
 ~~$$d5f55r6   c                 2    U R                   R                  nU$ )N)r,   model_input_names)r-   tokenizer_input_namess     r3   r@   #MarkupLMProcessor.model_input_names   s     $ @ @$$r6   r&   )NNNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____firstlineno____doc__feature_extractor_classtokenizer_classr'   boolr   r+   r	   r
   r   intr   r   r4   r8   r=   propertyr@   __static_attributes__r&   r6   r3   r   r      sU   & 9DOJ #'5:;?$(,00404*/+0',#;?)N !N tS/12N $%778N SMN N %SMN  (~N  (~N $(N  %)!N" !%#N$ %N& 'N( !sJ!78)N, 
-N`<6 % %r6   r   N)rG   typingr   r   
file_utilsr   processing_utilsr   tokenization_utils_baser   r	   r
   r   __all__r&   r6   r3   <module>rS      s4    # $ . Y Yy% y%x 
r6   