o
    Zh+5                     @   s(  d Z ddlmZmZmZ ddlZddlmZm	Z	m
Z
 ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZ ddlmZmZmZmZmZmZ dd	l m!Z! e rWddl"Z"e r^ddl#Z#e$e%Z&d
d Z'		ddej(dee) dee) deee)ef  fddZ*e!ddG dd deZ+dgZ,dS )z%Image processor class for LayoutLMv2.    )DictOptionalUnionN   )BaseImageProcessorBatchFeatureget_size_dict)flip_channel_orderresizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatmake_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_pytesseract_availableis_vision_availableloggingrequires_backends)requiresc                 C   sL   t d| d |  t d| d |  t d| d |  t d| d |  gS )Ni  r         r   )int)boxwidthheight r"   i/var/www/auris/lib/python3.10/site-packages/transformers/models/layoutlmv2/image_processing_layoutlmv2.pynormalize_box6   s
   r$   imagelangtesseract_configinput_data_formatc                    sb  |dur|nd}t | |d}|j\}}tj||d|d}|d |d |d |d	 |d
 f\}}	}
}}dd t|D   fddt|D } fddt|	D }	 fddt|
D }
 fddt|D } fddt|D }g }t|	|
||D ]\}}}}|||| || g}|| q{g }|D ]}|t||| qt|t|ksJ d||fS )zdApplies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.N r(   dict)r&   output_typeconfigtextlefttopr    r!   c                 S   s   g | ]
\}}|  s|qS r"   )strip.0idxwordr"   r"   r#   
<listcomp>O       z#apply_tesseract.<locals>.<listcomp>c                       g | ]
\}}| vr|qS r"   r"   r2   Zirrelevant_indicesr"   r#   r6   P   r7   c                    r8   r"   r"   r3   r4   Zcoordr9   r"   r#   r6   Q   r7   c                    r8   r"   r"   r:   r9   r"   r#   r6   R   r7   c                    r8   r"   r"   r:   r9   r"   r#   r6   S   r7   c                    r8   r"   r"   r:   r9   r"   r#   r6   T   r7   z-Not as many words as there are bounding boxes)	r   sizepytesseractZimage_to_data	enumeratezipappendr$   len)r%   r&   r'   r(   Z	pil_imageZimage_widthZimage_heightdatawordsr/   r0   r    r!   Zactual_boxesxywhZ
actual_boxZnormalized_boxesr   r"   r9   r#   apply_tesseract?   s(   
,rG   )Zvision)backendsc                       sB  e Zd ZdZdgZddejdddfdedee	e
ef  ded	ed
ee
 dee
 ddf fddZejddfdejde	e
ef dedeee
ef  deee
ef  dejfddZe dddddddejdf	dedee dee	e
ef  ded	ee d
ee
 dee
 deee
ef  dedeee
ef  dejjfddZ  ZS )LayoutLMv2ImageProcessora  
    Constructs a LayoutLMv2 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
            overridden by `do_resize` in `preprocess`.
        size (`Dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
            `preprocess` method.
        apply_ocr (`bool`, *optional*, defaults to `True`):
            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
            `apply_ocr` in `preprocess`.
        ocr_lang (`str`, *optional*):
            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
            used. Can be overridden by `ocr_lang` in `preprocess`.
        tesseract_config (`str`, *optional*, defaults to `""`):
            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
            Tesseract. For example: '--psm 6'. Can be overridden by `tesseract_config` in `preprocess`.
    pixel_valuesTNr)   	do_resizer;   resample	apply_ocrocr_langr'   returnc                    sX   t  jdi | |d ur|nddd}t|}|| _|| _|| _|| _|| _|| _d S )N   )r!   r    r"   )	super__init__r   rK   r;   rL   rM   rN   r'   )selfrK   r;   rL   rM   rN   r'   kwargs	__class__r"   r#   rR      s   

z!LayoutLMv2ImageProcessor.__init__r%   data_formatr(   c                 K   sT   t |}d|vsd|vrtd|  |d |d f}t|f||||d|S )a  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        r!   r    zFThe `size` dictionary must contain the keys `height` and `width`. Got )r;   rL   rW   r(   )r   
ValueErrorkeysr
   )rS   r%   r;   rL   rW   r(   rT   Zoutput_sizer"   r"   r#   r
      s   #zLayoutLMv2ImageProcessor.resizeimagesreturn_tensorsc                    sn  |dur|nj }durnjtdurnj|dur%|nj}|dur.|nj}|dur7|nj}t|}t|sFt	dt
|d dd |D }du r^t|d |rtd g }g }|D ]}t|||d\}}|| || qk|rfd	d|D }fd
d|D } fdd|D }td|i|d}|r||d< ||d< |S )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Desired size of the output image after resizing.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PIL.Image` resampling
                filter. Only has an effect if `do_resize` is set to `True`.
            apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
                Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
            ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
                The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
                used.
            tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
                Any additional custom configuration flags that are forwarded to the `config` parameter when calling
                Tesseract.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        NzkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rK   r;   rL   c                 S   s   g | ]}t |qS r"   )r   r3   r%   r"   r"   r#   r6     s    z7LayoutLMv2ImageProcessor.preprocess.<locals>.<listcomp>r   r<   r*   c                    s   g | ]}j | d qS ))r%   r;   rL   r(   )r
   r\   )r(   rL   rS   r;   r"   r#   r6     s    c                    s   g | ]}t | d qS )r*   )r	   r\   r*   r"   r#   r6   "  s    c                    s   g | ]	}t | d qS ))Zinput_channel_dim)r   r\   )rW   r(   r"   r#   r6   #  s    rJ   )rA   Ztensor_typerB   boxes)rK   r;   r   rL   rM   rN   r'   r   r   rX   r   r   r   rG   r?   r   )rS   rZ   rK   r;   rL   rM   rN   r'   r[   rW   r(   Zwords_batchZboxes_batchr%   rB   r]   rA   r"   )rW   r(   rL   rS   r;   r#   
preprocess   sR   /

z#LayoutLMv2ImageProcessor.preprocess)__name__
__module____qualname____doc__Zmodel_input_namesr   ZBILINEARboolr   r   strr   rR   npndarrayr   r   r
   r   ZFIRSTr   r   PILZImager^   __classcell__r"   r"   rU   r#   rI   f   s    	

0	
rI   )NN)-rb   typingr   r   r   numpyre   Zimage_processing_utilsr   r   r   Zimage_transformsr	   r
   r   r   Zimage_utilsr   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   Zutils.import_utilsr   rg   r<   Z
get_loggerr_   loggerr$   rf   rd   rG   rI   __all__r"   r"   r"   r#   <module>   s:   ( 


' 
I