
    fTh5$                     t    S r SSKrSSKJrJrJr  SSKJr  SSKJ	r	J
r
JrJrJr  SSKJr   " S S	\5      rS	/rg)
z 
Processor class for LayoutXLM.
    N)ListOptionalUnion   )ProcessorMixin)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypec            (         ^  \ rS rSrSrSS/rSrSrS#U 4S jjr                  S$S\	\
\\\
   \\   4   S	\\	\\\   4      S
\\	\\\      \\\\         4      S\\	\\   \\\      4      S\S\	\\\4   S\	\\\4   S\\   S\S\\   S\\   S\\   S\S\S\S\S\S\\	\\4      S\4&S jjrS rS rS r\S 5       r\S  5       r\S! 5       rS"rU =r$ )%LayoutXLMProcessor   a6  
Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
processor.

[`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.

It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
[`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).

Args:
    image_processor (`LayoutLMv2ImageProcessor`, *optional*):
        An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
    tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
        An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
image_processor	tokenizerLayoutLMv2ImageProcessor)LayoutXLMTokenizerLayoutXLMTokenizerFastc                    > SU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOWnUc  [	        S5      eUc  [	        S5      e[
        TU ]  X5        g )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.)warningswarnFutureWarningpop
ValueErrorsuper__init__)selfr   r   kwargsr   	__class__s        j/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/layoutxlm/processing_layoutxlm.pyr   LayoutXLMProcessor.__init__3   so    &(MM
 !'

+> ?-<-H/N_"HIIABB4    text	text_pairboxesword_labelsadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                    U R                   R                  (       a  Ub  [        S5      eU R                   R                  (       a  Ub  [        S5      eUSL a  USL a  [        S5      eU R                  UUS9nUb;  U R                   R                  (       a   Uc  [        U[        5      (       a  U/nUS   nU R
                  " S0 S	Ub  UOUS   _S
Ub  UOS_SUb  UOUS   _SU_SU_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nUR                  S5      nUSL a  U R                  UUS   5      nUUS'   U$ )a  
This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case
[`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to
`False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``.

Please refer to the docstring of the above two methods for more information.
NzdYou cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.zaYou cannot provide word labels if you initialized the image processor with apply_ocr set to True.TFzKYou cannot return overflowing tokens without returning the offsets mapping.)imagesr6   wordsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   pixel_valuesoverflow_to_sample_mappingimage )r   	apply_ocrr   
isinstancestrr   r   get_overflowing_images)r   r9   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r    featuresencoded_inputss                          r"   __call__LayoutXLMProcessor.__call__D   s   D ))u/@v  )){/Fs  %,1G51Pjkk ''vn'U  4 4 > >9CT$$$v )I 
)x/@
#,#8id
 !,%(72C
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
. n-$,00Hd9efF"(wr$   c                     / nU H  nUR                  X   5        M     [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eU$ )Nz`Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got z and )appendlenr   )r   r9   r<   images_with_overflow
sample_idxs        r"   rB   )LayoutXLMProcessor.get_overflowing_images   sr    !4J ''(:; 5 #$,F(GG,-.eC8R4S3TV 
 $#r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r   batch_decoder   argsr    s      r"   rN   LayoutXLMProcessor.batch_decode   s    
 ~~**D;F;;r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
)r   decoderO   s      r"   rS   LayoutXLMProcessor.decode   s    
 ~~$$d5f55r$   c                 
    / SQ$ )N)	input_idsbboxattention_maskr=   r>   r   s    r"   model_input_names$LayoutXLMProcessor.model_input_names   s    ??r$   c                 P    [         R                  " S[        5        U R                  $ )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classrY   s    r"   feature_extractor_class*LayoutXLMProcessor.feature_extractor_class   s"    u	
 )))r$   c                 P    [         R                  " S[        5        U R                  $ )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   rY   s    r"   r   $LayoutXLMProcessor.feature_extractor   s"    i	
 ###r$   r>   )NN)NNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____firstlineno____doc__
attributesr]   tokenizer_classr   r   r   r
   r   r   intboolrA   r	   r   r   r   rE   rB   rN   rS   propertyrZ   r^   r   __static_attributes____classcell__)r!   s   @r"   r   r      s(   & $[1J6FO5( _cQUIMCG#'5:;?$(,00404*/+0',#;?)T I0$y/4HYCZZ[T E"3T:K5L"LMN	T
 d49otDcO/DDEFT eDItDI$>?@T !T tS/12T $%778T SMT T %SMT  (~T  (~T $(T  %)!T" !%#T$ %T& 'T( !sJ!78)T, 
-Tl$<6 @ @ * * $ $r$   r   )rf   r   typingr   r   r   processing_utilsr   tokenization_utils_baser   r	   r
   r   r   utilsr   r   __all__r>   r$   r"   <module>rs      s:     ( ( . w w m$ m$`  
 r$   