o
    ZhMM                     @   s  d Z ddlZddlZddlmZmZmZ ddlZddl	m
Z
 ddlmZmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZ dd	lmZmZmZmZ dd
l m!Z! e rgddl"Z"ddl#m$Z$m%Z%m&Z& e rnddl'Z'e(e)Z*dZ+dd Z,									d'de-de.de-de-de.de.de.de.dee/ dee- de$j$fddZ0	d(d ej1d!e-d"eee-e2f  fd#d$Z3G d%d& d&eZ4d&gZ5dS ))z%Image processor class for Pix2Struct.    N)DictOptionalUnion)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_list_of_imagesto_numpy_arrayvalid_images)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                 C   s   t tdg | d} tjjj| ||f||fd}|| d| d||d}|	ddddd| d| | d| | d| | }|dS )	a  
    Utiliy function to extract patches from a given image tensor. Returns a tensor of shape (1, `patch_height`,
    `patch_width`, `num_channels`x `patch_height` x `patch_width`)

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    torchr   )Zstride         r   )
r   torch_extract_patches	unsqueezer   nn
functionalZunfoldreshapesizeZpermute)Zimage_tensorpatch_heightpatch_widthpatches r*   i/var/www/auris/lib/python3.10/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr!   4   s   

r!   $   blackwhite   text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 C   s   t td tjdd}
|
j| d}d|}|dur$|	du r$t|}n|	dur+|	}ntt	d}t
j|d|d	}ttd
d|}|d||\}}}}|| | }|| | }td
||f|}t|}|j||f|||d |S )a  
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    visionP   )width)r0   
Nz	Arial.TTFzUTF-8)encodingr&   RGB)r   r   r   r   )Zxyr0   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   Ztruetyper   ZDrawr   newZtextbboxr0   )r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   wrapperlinesZwrapped_textrC   Z	temp_draw_
text_widthZtext_heightimage_widthimage_heightimageZdrawr*   r*   r+   rD   O   s$   
&


rD   rS   headerinput_data_formatc           	      K   s   t td t| |d} t|fi |}t|j| j}t| j|| j  }t|j||j  }t	d||| fd}|
|||fd |
| ||fd|f t|}t|tjkrbt|tj}|S )a  
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    r;   )rU   r@   r.   rA   r   )r   render_headerr   rD   maxr=   intheightr   rL   Zpasteresizer   r   r   ZLASTr   )	rS   rT   rU   kwargsZheader_imageZ	new_widthZ
new_heightZnew_header_heightZ	new_imager*   r*   r+   rV      s   
rV   c                       s@  e Zd ZdZdgZ					ddeded	eeee	f  d
e	deddf fddZ
	ddejd
e	d	edeeeef  dejf
ddZ		ddejdeeeef  deeeef  dejfddZddddddejdfdedee dee dee d
ee	 d	eeee	f  deeeef  dedeeeef  defddZ  ZS )Pix2StructImageProcessoraf  
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://arxiv.org/pdf/2210.03347.pdf).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    flattened_patchesTN   Fdo_convert_rgbdo_normalize
patch_sizemax_patchesis_vqar:   c                    sF   t  jdi | |d ur|nddd| _|| _|| _|| _|| _d S )N   )rY   r=   r*   )super__init__ra   r`   r_   rb   rc   )selfr_   r`   ra   rb   rc   r[   	__class__r*   r+   rf      s   	
z!Pix2StructImageProcessor.__init__rS   rU   c              	   K   s  t | jd t|tj|}t|}|d |d }}t|tj\}}	t	|||  ||	  }
t
tt|
| | |d}t
tt|
|	 | |d}t
|| d}t
|| d}tjjj|d||fdddd	d}t|||}|j}|d }|d
 }|d }||| |g}t||dgd||| dg}t|d|g|d|| dg}|d7 }|d7 }|tj}|tj}t|||gd}tjj|ddd|||  g }t|}|S )a  
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches.
        r   rY   r=   r   r   ZbilinearFT)r&   modeZalign_cornersZ	antialiasr    r   r   )r   extract_flattened_patchesr   r   FIRSTr   Z
from_numpyr   mathsqrtrW   minfloorr#   r$   Zinterpolater"   Zsqueezer!   shaper%   Zarangerepeattofloat32catpadfloatr   )rg   rS   rb   ra   rU   r[   r'   r(   rR   rQ   scaleZnum_feasible_rowsZnum_feasible_colsZresized_heightZresized_widthr)   Zpatches_shaperowscolumnsdepthZrow_idsZcol_idsresultr*   r*   r+   rk      sF   
	**$z2Pix2StructImageProcessor.extract_flattened_patchesdata_formatc                 K   sb   |j tjkr|tj}t|}t|}t|dt	t
|j }t|f||||d|S )a  
        Normalize an image. image = (image - image_mean) / image_std.

        The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
        https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        g      ?)meanstdr}   rU   )ZdtypenpZuint8astypert   r~   r   rW   rm   rn   prodrq   r
   )rg   rS   r}   rU   r[   r~   r   Zadjusted_stddevr*   r*   r+   r
   5  s   

z"Pix2StructImageProcessor.normalizeimagesheader_textreturn_tensorsc
                    sd  |dur|nj }|dur|nj}durnjdur!njj}|
dddur3tdt|}t|s?td|rHdd |D }dd |D }du rYt	|d |rdu rctd	|

d
d |

ddttr{gt|  fddt|D }|rfdd|D }fdd|D }dd |D }t||d|d}|S )a  
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
        images are standardized following the tensorflow implementation of `per_image_standardization`
        (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).


        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[List[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Nr}   z8data_format is not an accepted input as the outputs are zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.c                 S      g | ]}t |qS r*   )r	   .0rS   r*   r*   r+   
<listcomp>      z7Pix2StructImageProcessor.preprocess.<locals>.<listcomp>c                 S   r   r*   )r   r   r*   r*   r+   r     r   r   z.A header text must be provided for VQA models.r8   r9   c                    s$   g | ]\}}t ||  d qS ))r8   r9   )rV   )r   irS   )r8   r9   r   r*   r+   r     s    c                    s   g | ]	}j | d qS ))rS   rU   )r
   r   )rU   rg   r*   r+   r     s    c                    s   g | ]}j | d qS ))rS   rb   ra   rU   )rk   r   )rU   rb   ra   rg   r*   r+   r     s    c                 S   s$   g | ]}|j d ddktjqS )r   )Zaxisr   )sumr   r   rt   r   r*   r*   r+   r     s   $ )r]   Zattention_mask)dataZtensor_type)r`   r_   ra   rb   rc   get
ValueErrorr   r   r   pop
isinstancestrlen	enumerater   )rg   r   r   r_   r`   rb   ra   r   r}   rU   r[   rc   Zattention_masksZencoded_outputsr*   )r8   r9   r   rU   rb   ra   rg   r+   
preprocess\  sJ   5

z#Pix2StructImageProcessor.preprocess)TTNr^   FN)NN)__name__
__module____qualname____doc__Zmodel_input_namesboolr   r   r   rX   rf   r   ndarraydictr   r   rk   r
   rl   r   r   r   __classcell__r*   r*   rh   r+   r\      s    
T
*	
r\   )	r,   r-   r.   r/   r/   r/   r/   NNr   )6r   rI   rm   typingr   r   r   numpyr   Zhuggingface_hubr   Zimage_processing_utilsr   r   Zimage_transformsr	   r
   r   r   Zimage_utilsr   r   r   r   r   r   r   utilsr   r   r   r   Zutils.import_utilsr   rE   ZPILr   r   r   r   Z
get_loggerr   loggerrK   r!   r   rX   bytesrD   r   ChildProcessErrorrV   r\   __all__r*   r*   r*   r+   <module>   s~   $	
	

E
*  
