
    fTh#                     t    S r SSKrSSKJrJrJr  SSKJr  SSKJ	r	J
r
JrJrJr  SSKJr   " S S	\5      rS	/rg)
z!
Processor class for LayoutLMv3.
    N)ListOptionalUnion   )ProcessorMixin)BatchEncodingPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypec            (         ^  \ rS rSrSrSS/rSrSrS#U 4S jjr                  S$S\	\
\\\
   \\   4   S	\\	\\\   4      S
\\	\\\      \\\\         4      S\\	\\   \\\      4      S\S\	\\\4   S\	\\\4   S\\   S\S\\   S\\   S\\   S\S\S\S\S\S\\	\\4      S\4&S jjrS rS rS r\S 5       r\S  5       r\S! 5       rS"rU =r$ )%LayoutLMv3Processor   a>  
Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
single processor.

[`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.

It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
[`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).

Args:
    image_processor (`LayoutLMv3ImageProcessor`, *optional*):
        An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
    tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
        An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
image_processor	tokenizerLayoutLMv3ImageProcessor)LayoutLMv3TokenizerLayoutLMv3TokenizerFastc                    > S nSU;   a,  [         R                  " S[        5        UR                  S5      nUb  UOUnUc  [	        S5      eUc  [	        S5      e[
        TU ]  X5        g )Nfeature_extractorzhThe `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.z)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.)warningswarnFutureWarningpop
ValueErrorsuper__init__)selfr   r   kwargsr   	__class__s        l/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.pyr   LayoutLMv3Processor.__init__3   su     &(MM
 !'

+> ?-<-H/N_"HIIABB4    text	text_pairboxesword_labelsadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosereturn_tensorsreturnc                 `   U R                   R                  (       a  Ub  [        S5      eU R                   R                  (       a  Ub  [        S5      eU R                  UUS9nUb;  U R                   R                  (       a   Uc  [        U[        5      (       a  U/nUS   nU R
                  " S0 SUb  UOUS   _SUb  UOS_SUb  UOUS   _S	U_S
U_SU_SU_SU	_SU
_SU_SU_SU_SU_SU_SU_SU_SU_SU_UD6nUR                  S5      nUSL a  U R                  UUS   5      nUUS'   U$ )a  
This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
[`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
`apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along
with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
resized and normalized `pixel_values`.

Please refer to the docstring of the above two methods for more information.
NzdYou cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True.zaYou cannot provide word labels if you initialized the image processor with apply_ocr set to True.)imagesr6   wordsr%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   pixel_valuesToverflow_to_sample_mapping )r   	apply_ocrr   
isinstancestrr   r   get_overflowing_images)r   r9   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r    featuresencoded_inputss                          r"   __call__LayoutLMv3Processor.__call__E   s   F ))u/@v  )){/Fs 
 ''vn'U  4 4 > >9CT$$$v )I 
)x/@
#,#8id
 !,%(72C
 $	

  2
 
 "
 "
 
  2
 #8
 #8
 '@
 (B
 $:
  (!
" #
$ *'
. n-$,00Hd9efF)/~&r$   c                     / nU H  nUR                  X   5        M     [        U5      [        U5      :w  a#  [        S[        U5       S[        U5       35      eU$ )Nz`Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got z and )appendlenr   )r   r9   r<   images_with_overflow
sample_idxs        r"   rA   *LayoutLMv3Processor.get_overflowing_images   sr    !4J ''(:; 5 #$,F(GG,-.eC8R4S3TV 
 $#r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r   batch_decoder   argsr    s      r"   rM    LayoutLMv3Processor.batch_decode   s    
 ~~**D;F;;r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
)r   decoderN   s      r"   rR   LayoutLMv3Processor.decode   s    
 ~~$$d5f55r$   c                 
    / SQ$ )N)	input_idsbboxattention_maskr;   r=   r   s    r"   model_input_names%LayoutLMv3Processor.model_input_names   s    FFr$   c                 P    [         R                  " S[        5        U R                  $ )Nzg`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.)r   r   r   image_processor_classrX   s    r"   feature_extractor_class+LayoutLMv3Processor.feature_extractor_class   s"    u	
 )))r$   c                 P    [         R                  " S[        5        U R                  $ )Nz[`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.)r   r   r   r   rX   s    r"   r   %LayoutLMv3Processor.feature_extractor   s"    i	
 ###r$   r=   )NN)NNNNTFNNr   NNNFFFFTN)__name__
__module____qualname____firstlineno____doc__
attributesr\   tokenizer_classr   r   r   r
   r   r   intboolr@   r	   r   r   r   rD   rA   rM   rR   propertyrY   r]   r   __static_attributes____classcell__)r!   s   @r"   r   r      s(   & $[1J6HO5* _cQUIMCG#'5:;?$(,00404*/+0',#;?)R I0$y/4HYCZZ[R E"3T:K5L"LMN	R
 d49otDcO/DDEFR eDItDI$>?@R !R tS/12R $%778R SMR R %SMR  (~R  (~R $(R  %)!R" !%#R$ %R& 'R( !sJ!78)R, 
-Rh$<6 G G * * $ $r$   r   )re   r   typingr   r   r   processing_utilsr   tokenization_utils_baser   r	   r
   r   r   utilsr   r   __all__r=   r$   r"   <module>rr      s:     ( ( . w w l$. l$^ !
!r$   