o
    Zh>                     @   sR  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlmZmZ ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZmZmZm Z  e !e"Z#e r^dd
l$m%Z% edd	d de&de&de&de&de'dee&e&f fddZ(de
j)de&de
j)fddZ*d!de
j)de&de&dee
j)e
j)f fddZ+G dd deZ,dgZ-dS )"z"Image processor class for SigLIP2.    N)	lru_cache)ListOptionalTupleUnion   )BaseImageProcessorBatchFeature)convert_to_rgbresizeto_channel_dimension_format)	ChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_vision_availablelogging)Image   )maxsizeh㈵>image_heightimage_width
patch_sizemax_num_patchesepsreturnc                 C   s   dt dtdtdtfdd}|d d}}|| |krC|| d	 }||| |}	||||}
|	| |
|  }||kr;|}n|}|| |ks|}||| |}	||||}
|	|
fS )
a"  
    Determine image size based on max number of patches, ensure dimensions are divisible by patch size and image is at least 1 patch.

    Args:
        image_height (`int`):
            Original image height.
        image_width (`int`):
            Original image width.
        patch_size (`int`):
            Patch size for processing.
        max_num_patches (`int`):
            Maximum number of patches.
        eps (`float`):
            Small threshold for binary search.

    Returns:
        Tuple: (target_height, target_width)
    scalesizer    r#   c                 S   s,   ||  }t || | }t||}t|S )N)mathceilmaxint)r$   r%   r    Zscaled_size r*   c/var/www/auris/lib/python3.10/site-packages/transformers/models/siglip2/image_processing_siglip2.pyget_scaled_image_sizeI   s   
zAget_image_size_for_max_num_patches.<locals>.get_scaled_image_size
   g      Y@   )floatr)   )r   r   r    r!   r"   r,   Z	scale_minZ	scale_maxr$   Ztarget_heightZtarget_widthZnum_patchesr*   r*   r+   "get_image_size_for_max_num_patches2   s   r0   imagec                 C   sT   | j \}}}|| }|| }| |||||}|ddddd}||| d}|S )z
    Convert 3D array image of shape (image_height, image_width, num_channels) into 2D array of patches of shape
    (num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
    r   r.      r      )shapeZreshapeZ	transpose)r1   r    r   r   Znum_channelsnum_patches_heightnum_patches_widthZpatched_imager*   r*   r+   convert_image_to_patchesb   s   r8   arraytarget_length	pad_valuec                 C   sn   | j d }|| }tj|ftjd}|dkr3d|fgdg| jd   }tj| |d|d} d|| d< | |fS )z2
    Pad the array along the first dimension.
    r   )Zdtype)r   r   r2   Zconstant)modeZconstant_valuesN)r5   npZonesZint32ndimpad)r9   r:   r;   Zcurrent_lengthZpadding_lengthmaskZpaddingsr*   r*   r+   pad_along_first_dimp   s   
rA   c                       sH  e Zd ZdZg dZdejddddddddf
ded	d
dededede	e
eee f  de	e
eee f  de	e dedef fddZe 												ddede	e d	e	d
 de	e de	e de	e de	e
eee f  de	e
eee f  de	e
eef  de	e
eef  de	e de	e de	e ddfddZ  ZS )Siglip2ImageProcessora3	  
    Constructs a SigLIP2 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
            Can be overridden by `do_resize` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
            `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch the image will be split to.
        max_num_patches (`int`, *optional*, defaults to 256):
            The image will be resized to have at most this number of patches,
            and then padded in "patch" dimension to match this number exactly.
    pixel_valuesZpixel_attention_maskspatial_shapesTgp?N   r   	do_resizeresampler   
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbr    r!   c                    sz   t  jdi | |d ur|ng d}|d ur|ng d}|| _|| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _d S )N)      ?rO   rO   r*   )super__init__rG   rH   rI   rJ   rK   rL   rM   rN   r    r!   )selfrG   rH   rI   rJ   rK   rL   rM   rN   r    r!   kwargs	__class__r*   r+   rQ      s   
zSiglip2ImageProcessor.__init__imagesreturn_tensorsinput_data_formatr#   zImage.Imagec                 C   s&  |dur|n| j }|dur|n| j}|dur|n| j}|dur!|n| j}|dur*|n| j}|dur3|n| j}|dur<|n| j}|durE|n| j}|durN|n| j}|durW|n| j	}t
j}t|}t|sitdt|||||d |r{dd |D }dd |D }|rt|d rtd |
du rt|d }
g }g }g }|D ]e}t|||
d	}|rt|jd |jd
 ||d\}}t|||f||d}|r| j|||d}|r| j||||d}t||}t||\}}|jd | }|jd
 | }|||f || || qt|||d|	d}|S )a  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                Patch size for processing, same as the patch size used in the model.
            max_num_patches (`int`, *optional*, defaults to `self.max_num_patches`):
                Maximum number of patches per image, the image will be resized to have at most this number of patches.
        NzkInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray.)rI   rJ   rK   rL   rM   c                 S      g | ]}t |qS r*   )r
   .0r1   r*   r*   r+   
<listcomp>      z4Siglip2ImageProcessor.preprocess.<locals>.<listcomp>c                 S   rY   r*   )r   rZ   r*   r*   r+   r\      r]   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)Zinput_channel_dimr2   )r   r   r    r!   )r1   r%   rH   rX   )r1   r$   rX   )r1   meanZstdrX   rC   )dataZtensor_type)rG   rH   rI   rJ   rK   rL   rM   rN   r    r!   r   ZLASTr   r   
ValueErrorr   r   loggerZwarning_oncer   r   r0   r5   r   Zrescale	normalizer8   rA   appendr	   )rR   rV   rG   rH   rI   rJ   rK   rL   rM   rW   rX   rN   r    r!   Zdata_formatZpixel_masksrD   rE   r1   heightwidthZpatchesr@   r6   r7   Zbatch_featurer*   r*   r+   
preprocess   s   >


	z Siglip2ImageProcessor.preprocess)NNNNNNNNNNNN)__name__
__module____qualname____doc__Zmodel_input_namesr   ZBILINEARboolr/   r   r   r   r)   rQ   r   r   strr   r   rf   __classcell__r*   r*   rT   r+   rB   ~   s    "	
	
rB   )r   )r   ).rj   r&   	functoolsr   typingr   r   r   r   numpyr=   Zimage_processing_utilsr   r	   Zimage_transformsr
   r   r   Zimage_utilsr   r   r   r   r   r   r   r   r   utilsr   r   r   r   Z
get_loggerrg   ra   ZPILr   r)   r/   r0   Zndarrayr8   rA   rB   __all__r*   r*   r*   r+   <module>   s@   ,

/* 
Z