o
    Zh)m                     @   s  d dl Z d dlmZmZmZmZmZ d dlZddl	m
Z
mZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZ ddlm Z m!Z!m"Z" e! r[d dl#m$Z$ e"%e&Z'd	eee  fd
dZ(	dde)de)de)de)de)f
ddZ*G dd de
Z+dgZ,dS )    N)DictIterableListOptionalUnion   )BaseImageProcessorBatchFeature)convert_to_rgbpadresizeto_channel_dimension_format)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imageis_valid_imagemake_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypeis_vision_availablelogging)Imagereturnc                 C   sx   t | ttfrt | d ttfrt| d d rdd | D S t | ttfr.t| d r.| S t| r5| gS td|  )a  
    Accepts images in list or nested list format, and makes a list of images for preprocessing.

    Args:
        images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
            The input image.

    Returns:
        list: A list of images.
    r   c                 S   s   g | ]	}|D ]}|qqS  r    ).0Zimg_listZimgr    r    ]/var/www/auris/lib/python3.10/site-packages/transformers/models/emu3/image_processing_emu3.py
<listcomp><       z'make_batched_images.<locals>.<listcomp>z#Could not make batched images from )
isinstancelisttupler   
ValueError)imagesr    r    r"   make_batched_images0   s   0r*      @   P heightwidthfactor
min_pixels
max_pixelsc                 C   s$  | |k s||k rt d|  d| d| t| |t| | dkr0t dt| |t| |  t| | | }t|| | }|| |krit| | | }t| | | | }t|| | | }||fS || |k rt|| |  }t| | | | }t|| | | }||fS )a)  Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    zheight:z
 or width:z must be larger than factor:   z4absolute aspect ratio must be smaller than 200, got )r(   maxminroundmathsqrtfloorceil)r.   r/   r0   r1   r2   Zh_barZw_barbetar    r    r"   smart_resizeG   s$   r<   c                        s  e Zd ZdZddgZdejdddddddddd	fd
edededee	e
f dedeee
ee
 f  deee
ee
 f  dedede	de	de	ddf fddZddddddddejdf
ded
ee dedee dee
 dee deee
ee
 f  deee
ee
 f  dee dee deeeef  fddZ		d*deej deee	  deeeef  deeeef  fddZdddddddddddejdfded
ee d eeee	f  dedee dee
 dee deee
ee
 f  deee
ee
 f  dee ded!eeeef  dee deeeef  fd"d#Z						$	d+dedee dee
 dee deee
ee
 f  deee
ee
 f  d!eeef deeeef  fd%d&Z	d,d'ejdee
ee
 f dee
ee
 f deeeef  dejf
d(d)Z  ZS )-Emu3ImageProcessora	  
    Constructs a Emu3 image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_pad (`bool`, *optional*, defaults to `True`):
                Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
                number of patches in the batch. Padding will be applied to the bottom and right with zeros.
        min_pixels (`int`, *optional*, defaults to `512 * 512`):
            The min pixels of the image to resize the image.
        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
            The max pixels of the image to resize the image.
        spatial_factor (`int`, *optional*, defaults to 8):
            The spatial downsample factor the image will be downsampled in feature extracting phase
    pixel_valuesimage_sizesTgp?Ni   i      	do_resizeresample
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbdo_padr1   r2   spatial_factorr   c                    s|   t  jdi | || _|| _|| _|| _|| _|d ur|nt| _|d ur'|nt	| _
|
| _|| _|| _|
|d| _|| _d S )N)r1   r2   r    )super__init__rA   rB   rC   rD   rE   r   rF   r   rG   r1   r2   rJ   sizerH   )selfrA   rB   rC   rD   rE   rF   rG   rH   rI   r1   r2   rJ   kwargs	__class__r    r"   rL      s   
zEmu3ImageProcessor.__init__r)   data_formatinput_data_formatc                 C   s  t |}|	rdd |D }dd |D }t|d r!|r!td |du r+t|d }t|d |d\}}||}}g }|D ]=}|rZt||| j| j| j	d\}}t
|||f||d	}|rd| j|||d
}|ro| j||||d}t||
|d}|| q>t|}|S )a
  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            vision_info (`List[Dict]`, *optional*):
                Optional list of dictionaries containing additional information about vision inputs.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.   - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        c                 S      g | ]}t |qS r    )r
   r!   imager    r    r"   r#          z2Emu3ImageProcessor._preprocess.<locals>.<listcomp>c                 S   rT   r    )r   rU   r    r    r"   r#      rW   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.N)Zchannel_dim)r0   r1   r2   )rM   rB   rS   scalerS   rV   meanstdrS   Zinput_channel_dim)r   r   loggerZwarning_oncer   r   r<   rJ   r1   r2   r   rescale	normalizer   appendnparray)rN   r)   rA   rB   rC   rD   rE   rF   rG   rH   rR   rS   r.   r/   Zresized_heightZresized_widthZprocessed_imagesrV   r    r    r"   _preprocess   sF   1


zEmu3ImageProcessor._preprocessc                    sD   t dd |D t dd |D f fddt||D }|S )au  
        Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.

        Args:
            pixel_values (`List[np.ndarray]`):
                An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)
            image_sizes (`List[List[int]]`):
                A list of sizes for each image in `pixel_values` in (height, width) format.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            List[`np.ndarray`]: The padded images.
        c                 S      g | ]}|d  qS )r   r    r!   rM   r    r    r"   r#   %  rW   z8Emu3ImageProcessor._pad_for_batching.<locals>.<listcomp>c                 S   re   )   r    rf   r    r    r"   r#   &  rW   c              	      sD   g | ]\}}t |d d  |d   fd d |d  ff dqS )r   rg   )paddingrR   rS   )r   )r!   rV   rM   rR   rS   Z	max_shaper    r"   r#   (  s    &)r4   zip)rN   r>   r?   rR   rS   r    ri   r"   _pad_for_batching  s   	z$Emu3ImageProcessor._pad_for_batchingrM   return_tensorsc                 C   sd  |dur|n| j }|dur|n| j}|dur|n| j}|dur!|n| j}|dur*|n| j}|dur3|n| j}|dur<|n| j}|	durE|	n| j}	|
durN|
n| j}
|durW|n| j	}|durbt
|}|durnt|sntdt||||	|||d g }|D ]}| j||||||||	||
|d}|| q}dd |D }|r| ||}t|}t||d|dS )	a  
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
                the longest edge resized to keep the input aspect ratio.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_pad (`bool`, *optional*, defaults to `True`):
                Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
                number of patches in the batch. Padding will be applied to the bottom and right with zeros.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        NzkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rD   rE   rF   rG   rA   rM   rB   )
rA   rB   rC   rD   rE   rF   rG   rR   rH   rS   c                 S   s   g | ]	}|j d d qS )N)shaperU   r    r    r"   r#     r$   z1Emu3ImageProcessor.preprocess.<locals>.<listcomp>)r>   r?   dataZtensor_type)rA   rM   rB   rC   rD   rE   rF   rG   rH   rI   r*   r   r(   r   rd   extendrk   rb   rc   r	   )rN   r)   rA   rM   rB   rC   rD   rE   rF   rG   rH   rI   rl   rR   rS   r>   rV   r?   r    r    r"   
preprocess3  s`   B


zEmu3ImageProcessor.preprocessPIL.Image.Imagec	                 C   sV  |dur|n| j }|du rd| j n|}|dur|n| j}|dur#|n| j}|dur,|n| j}t|}t|d tjrGt|dkrC|S |d S |du rQt	|d }g }	|D ]C}
t
|
}
|rf| j|
|||d}
|rz| j|
||d}
|
ddtj}
|r|r|dkrt|
tj|d	}
|	t|
 qU|	|
 qUd
|	i}|dkr|nd}t||dS )a%  
        Postprocess an image or batch of images tensor. Postprocess is the reverse process of preprocess.
        The parameters should be same as in preprocess.
        Args:
            images (`ImageInput`):
                Image to postprocess. Expects a single or batch of images with pixel values ranging from -1 to 1.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Ng      ?r   rg   )rV   rF   rG   rS   rX      rs   r]   r>   ro   )rC   rD   rE   rF   rG   r   r%   r   lenr   r   unnormalizer_   ZclipZastyperb   Zuint8r   r   ZLASTra   Z	fromarrayrq   r	   )rN   r)   rC   rD   rE   rF   rG   rl   rS   r>   rV   rp   r    r    r"   postprocess  s6   'zEmu3ImageProcessor.postprocessrV   c                 C   s   d}t |trt||krtd| dt| n|g| }t |tr7t||kr6td| dt| n|g| }tdd t||D }tdd |D }| j||||d}|S )	a~  
        Unnormalizes `image` using the mean and standard deviation specified by `mean` and `std`.
        image = (image * image_std) + image_mean
        Args:
            image (`torch.Tensor` of shape `(batch_size, num_channels, image_size, image_size)` or `(num_channels, image_size, image_size)`):
                Batch of pixel values to postprocess.
            image_mean (`float` or `Iterable[float]`):
                The mean to use for unnormalization.
            image_std (`float` or `Iterable[float]`):
                The standard deviation to use for unnormalization.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        r   zmean must have z$ elements if it is an iterable, got zstd must have c                 s   s    | ]
\}}| | V  qd S Nr    )r!   r[   r\   r    r    r"   	<genexpr>  s    z1Emu3ImageProcessor.unnormalize.<locals>.<genexpr>c                 s   s    | ]}d | V  qdS )rg   Nr    )r!   r\   r    r    r"   ry      s    rZ   )r%   r   ru   r(   r'   rj   r`   )rN   rV   rF   rG   rS   Znum_channelsZrev_image_meanZrev_image_stdr    r    r"   rv     s"   



zEmu3ImageProcessor.unnormalize)NN)NNNNNrs   Nrx   )__name__
__module____qualname____doc__Zmodel_input_namesr   ZBICUBICboolr   intfloatr   r   rL   r   ZFIRSTr   strrd   rb   Zndarrayrk   r   r   rr   rw   rc   r   rv   __classcell__r    r    rP   r"   r=   f   sH   
	
!	

d

0	

}
	
Qr=   )r+   r,   r-   )-r7   typingr   r   r   r   r   numpyrb   Zimage_processing_utilsr   r	   Zimage_transformsr
   r   r   r   Zimage_utilsr   r   r   r   r   r   r   r   r   r   r   r   r   utilsr   r   r   ZPILr   Z
get_loggerrz   r^   r*   r   r<   r=   __all__r    r    r    r"   <module>   s8   <

   
D