o
    Zh\                     @   s   d dl mZmZmZmZmZ d dlZddlm	Z	m
Z
mZmZ ddlmZmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZm Z  e !e"Z#dej$d	e%d
eej$ fddZ&G dd de	Z'dgZ(dS )    )IterableListOptionalTupleUnionN   )BaseImageProcessorBatchFeatureget_patch_output_sizeselect_best_resolution)PaddingModeconvert_to_rgbpadresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypeloggingimage
patch_sizereturnc           	      C   s   g }t | |d\}}td||D ]5}td||D ],}|tjkr.| ||| ||| f }n| dd||| ||| f }|| qq|S )a  
    Divides an image into patches of a specified size.

    Args:
        image (`np.array`):
            The input image.
        patch_size (`int`):
            The size of each patch.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        list: A list of np.array representing the patches.
    channel_dimr   N)r   ranger   ZLASTappend)	r   r   input_data_formatpatchesheightwidthijpatch r+   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/aria/image_processing_aria.pydivide_to_patches-   s   
"r-   c                        sV  e Zd ZdZg dZddddddddddejfd	eee	  d
eee	  de
de
deeee
e
f   dee dee dedee
e	f dee def fddZdddddddddddejdfdeeee f d	eee	ee	 f  d
eee	ee	 f  dee
 dee
 dee dee dee dee	 dee dedeeeef  dee deeeef  fddZdejdededejfd d!Zdejdededejfd"d#Zejd$ddfdejd%ee
ee
e
f eee
e
f  f d&ed'ee	ee	 f deeeef  deeeef  dejfd(d)Zdejd*eee
e
f  d+e
dedededeej fd,d-Z  Z S ).AriaImageProcessoraG  
    A vision processor for the Aria model that handles image preprocessing.
    Initialize the AriaImageProcessor.

    Args:
        image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
            Mean values for normalization.
        image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
            Standard deviation values for normalization.
        max_image_size (`int`, *optional*, defaults to 980):
            Maximum image size.
        min_image_size (`int`, *optional*, defaults to 336):
            Minimum image size.
        split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples):
            The optimal resolutions for splitting the image.
        split_image (`bool`, *optional*, defaults to `False`):
            Whether to split the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
            The resampling filter to use if resizing the image.
    pixel_values
pixel_mask	num_cropsN  iP  FTgp?
image_mean	image_stdmax_image_sizemin_image_sizesplit_resolutionssplit_imagedo_convert_rgb
do_rescalerescale_factordo_normalizeresamplec                    s   t  jdi | |d u rg d}|d u rg d}|| _|| _|| _|| _|| _|d u r7g d}dd |D }|| _|| _|| _	|	| _
|
| _|| _d S )N)      ?r?   r?   ))      )r@   r   )r@      )r@      )r@      )r@      )r@      )rA   rB   )rA   r   )rA   rA   )rA   r@   )r   r@   )r   rA   )rB   r@   )rB   rA   )rC   r@   )rD   r@   )rE   r@   )rF   r@   c                 S   s$   g | ]}|d  d |d d fqS )r     r@   r+   ).0elr+   r+   r,   
<listcomp>   s   $ z/AriaImageProcessor.__init__.<locals>.<listcomp>r+   )super__init__r6   r7   r4   r5   r9   r8   r:   r;   r<   r=   r>   )selfr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   kwargs	__class__r+   r,   rL   k   s&   
zAriaImageProcessor.__init__ptimagesreturn_tensorsdata_formatr$   c              	   C   s  |dur|n| j }|dur|n| j}|dur|n| j}|dur!|n| j}|dur*|n| j}|dur3|n| j}|dur<|n| j}|	durE|	n| j}	|
durN|
n| j}
|durW|n| j	}|dvrbt
dt|}t|snt
dt|
|||||	d |rdd |D }dd |D }|rt|d	 rtd
 |du rt|d	 }g }g }d}|D ]}|r| j|| j||||d}n|g}|du st||krt|}|D ]}t|\}}|t|| }||krtt|| ||f}n|tt|| |f}t|||||d}||d	  ||d  }}t|d	|fd	|ff||d}tj||ftd}d|d|d	 d|d f< || |r@| j||	|d}|
r\| j|| j | j||d}|durZt |||n|}|| qqt!tj"|d	dtj"|d	d|d|dS )aI  
        Process a list of images.

        Args:
            images (ImageInput or list of ImageInput):
                The input image or a list of images.
            image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
                Mean values for normalization.
            image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
                Standard deviation values for normalization.
            max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)):
                Maximum image size.
            min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)):
                Minimum image size.
            split_image (`bool`, *optional*, defaults to `self.split_image` (False)):
                Whether to split the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
                Whether to convert the image to RGB.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
                Whether to normalize the image.
            resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
                The resampling filter to use if resizing the image.
            return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"):
                The type of tensor to return.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`:
                        image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`:
                        image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`:
                        image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`:
                        image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            BatchFeature:
                A BatchFeature object containing:
                - 'pixel_values':
                    Tensor of processed image pixel values.
                - 'pixel_mask':
                    Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where:
                    - True (1) values indicate pixels that belong to the original resized image.
                    - False (0) values indicate pixels that are part of the padding.
                  The mask helps distinguish between actual image content and padded areas in subsequent processing steps.
                - 'num_crops':
                    The maximum number of crops across all images.
        N)rG   r3   z(max_image_size must be either 490 or 980zkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)r=   r4   r5   r>   r;   r<   c                 S      g | ]}t |qS r+   )r   rH   r   r+   r+   r,   rJ          z1AriaImageProcessor.preprocess.<locals>.<listcomp>c                 S   rU   r+   )r   rV   r+   r+   r,   rJ      rW   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.rT   r$   )r>   rT   r$   r@   )Zdtype)r   scaler$   )Zaxisr/   )dataZtensor_type)#r4   r5   r6   r7   r9   r:   r;   r<   r=   r>   
ValueErrorr   r   r   r   loggerZwarning_oncer   get_image_patchesr8   lenr   maxintr   r   npZzerosboolr#   Zrescale	normalizer   r	   stack)rM   rR   r4   r5   r6   r7   r9   r:   r;   r<   r=   r>   rS   rT   r$   r0   Zpixel_masksr2   r   Zcrop_imagesZ
crop_imagehwrY   new_sizeZcrop_image_resizedpadding_bottompadding_rightZcrop_image_paddedr1   r+   r+   r,   
preprocess   s   I		

	2zAriaImageProcessor.preprocessr   target_resolutionr   c                 C   s(   t |||\}}t|||f||d}|S )aC  
        Resizes an image to a target resolution while maintaining aspect ratio.

        Args:
            image (np.array):
                The input image.
            target_resolution (tuple):
                The target resolution (height, width) of the image.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            np.array: The resized and padded image.
        r>   r$   )r
   r   )rM   r   rk   r>   r$   
new_height	new_widthresized_imager+   r+   r,   _resize_for_patchingU  s   z'AriaImageProcessor._resize_for_patchingc                 C   sb   |\}}t |||\}}t|| d\}}	t|| d\}
}| j||
|
| f|||	 ffd}|S )zU
        Pad an image to a target resolution while maintaining aspect ratio.
        rA   )padding)r
   divmodr   )rM   r   rk   r$   Ztarget_heightZtarget_widthrm   rn   Zpaste_xZr_xZpaste_yZr_ypadded_imager+   r+   r,   _pad_for_patchingo  s   "z$AriaImageProcessor._pad_for_patchingg        rq   modeconstant_valuesc                 C   s   t |tst|dkrt||||||S |du rt|}tjdtjdtjdtj	di}t
j|||| |d}|durAt|||}|S |}|S )a	  
        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
        dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected
        as input.

        Args:
            image (`np.ndarray`):
                The image to pad.
            padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
                Padding to apply to the edges of the height, width axes. Can be one of three formats:
                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
                - `((before, after),)` yields same before and after pad for height and width.
                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
            mode (`PaddingMode`):
                The padding mode to use. Can be one of:
                    - `"constant"`: pads with a constant value.
                    - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                    vector along each axis.
                    - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                    - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            `np.ndarray`: The padded image.

        rB   NZconstantZreflectedgeZ	symmetric)ru   rv   )
isinstancer`   r^   r   r   r   CONSTANTZREFLECTZ	REPLICATEZ	SYMMETRICra   r   )rM   r   rq   ru   rv   rT   r$   Zpadding_mode_mappingr+   r+   r,   r     s    0zAriaImageProcessor.padgrid_pinpointsr   c                    st   t |ts	td|}t|d}t||}	| j||	|d}
| j|
|	d}t||d} fdd|D }|S )aY  
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (`np.array`):
                The input image to be processed.
            grid_pinpoints (List[Tuple[int, int]]):
                A list of possible resolutions as tuples.
            patch_size (`int`):
                Size of the patches to divide the image into.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            data_format (`ChannelDimension` or `str`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            `List[np.array]`: A list of NumPy arrays containing the processed image patches.
        z6grid_pinpoints must be a list of possible resolutions.r    rl   )r$   )r   r$   c                    s   g | ]	}t | d qS ))r!   Zinput_channel_dim)r   )rH   r*   rX   r+   r,   rJ     s    z8AriaImageProcessor.get_image_patches.<locals>.<listcomp>)rx   list	TypeErrorr   r   rp   rt   r-   )rM   r   rz   r   r>   rT   r$   Zpossible_resolutionsZ
image_sizeZbest_resolutionro   rs   r%   r+   rX   r,   r]     s   

z$AriaImageProcessor.get_image_patches)!__name__
__module____qualname____doc__Zmodel_input_namesr   ZBICUBICr   r   floatr`   r   rb   r   rL   r   ZFIRSTr   strr   rj   ra   arraytuplerp   rt   r   ry   Zndarrayr   r   r]   __classcell__r+   r+   rO   r,   r.   I   s   

	

'	

 G

 
Br.   ))typingr   r   r   r   r   numpyra   Zimage_processing_utilsr   r	   r
   r   Zimage_transformsr   r   r   r   r   Zimage_utilsr   r   r   r   r   r   r   r   r   r   utilsr   r   Z
get_loggerr}   r\   r   r`   r-   r.   __all__r+   r+   r+   r,   <module>   s   0
   
.