o
    Zh                     @   s  d Z ddlZddlmZmZmZmZ ddlZddl	m
Z
mZmZ ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZmZmZm Z m!Z!m"Z"m#Z# e r[ddl$Z$e"%e&Z'd	eeee  ee ef d
eee  fddZ(G dd deZ)G dd de
Z*dgZ+dS )zImage processor class for Fuyu.    N)DictListOptionalUnion   )BaseImageProcessorBatchFeatureget_size_dict)padresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imageis_valid_imagemake_list_of_imagesto_numpy_arrayvalidate_preprocess_arguments)
TensorTypefilter_out_non_signature_kwargsis_torch_availableis_torch_deviceis_torch_dtypeloggingrequires_backendsimagesreturnc                 C   sP   t | r| ggS t| trtdd | D r| S t| tr$dd | D S td)Nc                 s   s    | ]}t |tV  qd S N)
isinstancelist.0image r&   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/fuyu/image_processing_fuyu.py	<genexpr>@   s    z.make_list_of_list_of_images.<locals>.<genexpr>c                 S      g | ]}t |qS r&   )r   r#   r&   r&   r'   
<listcomp>D       z/make_list_of_list_of_images.<locals>.<listcomp>zHimages must be a list of list of images or a list of images or an image.)r   r!   r"   all
ValueError)r   r&   r&   r'   make_list_of_list_of_images:   s   
r.   c                   @   s6   e Zd ZdZd
deeeef  fddZddd	Z	dS )FuyuBatchFeaturez
    BatchFeature class for Fuyu image processor and processor.

    The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
    Ntensor_typec                    s   |du r| S | j |d\fdd  fdd|  D ]3\}t|tr<t|d tr<fdd	|D | < q t|trMfd
d	|D | < q || < q | S )a5  
        Convert the inner content to tensors.

        Args:
            tensor_type (`str` or [`~utils.TensorType`], *optional*):
                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                `None`, no modification is done.
        N)r0   c                    s   | r| S  | S r    r&   elem)	as_tensor	is_tensorr&   r'   _convert_tensor^   s   z<FuyuBatchFeature.convert_to_tensors.<locals>._convert_tensorc                    s*   z | W S    dkrt dt d)NZoverflowing_valueszKUnable to create tensor returning overflowing values of different lengths. zUnable to create tensor, you should probably activate padding with 'padding=True' to have batched tensors with the same length.)r-   r1   )r5   keyr&   r'   _safe_convert_tensorc   s   
zAFuyuBatchFeature.convert_to_tensors.<locals>._safe_convert_tensorr   c                    s   g | ]} fd d|D qS )c                       g | ]} |qS r&   r&   r$   r2   r7   r&   r'   r*   r   r+   zBFuyuBatchFeature.convert_to_tensors.<locals>.<listcomp>.<listcomp>r&   )r$   elemsr:   r&   r'   r*   r   s    z7FuyuBatchFeature.convert_to_tensors.<locals>.<listcomp>c                    r8   r&   r&   r9   r:   r&   r'   r*   u   r+   )Z_get_is_as_tensor_fnsitemsr!   r"   )selfr0   valuer&   )r5   r7   r3   r4   r6   r'   convert_to_tensorsP   s   	
z#FuyuBatchFeature.convert_to_tensorsr   r   c           	         s"  t | dg ddli }ddu r?tdkr?d }t|r$nt|ts2t|s2t|tr5|n
t	dt| dfdd | 
 D ]?\}}t|trtt|d trtg }|D ]}| fd	d
|D  q`|||< qLt|tr fdd
|D ||< qL |||< qL|| _| S )a  
        Send all values to device by calling `v.to(*args, **kwargs)` (PyTorch only). This should support casting in
        different `dtypes` and sending the `BatchFeature` to a different `device`.

        Args:
            args (`Tuple`):
                Will be passed to the `to(...)` function of the tensors.
            kwargs (`Dict`, *optional*):
                Will be passed to the `to(...)` function of the tensors.

        Returns:
            [`BatchFeature`]: The same instance after modification.
        torchr   Ndevicez*Attempting to cast a BatchFeature to type z. This is not supported.c                    s2    | r| j i S d ur| jdS | S )N)rA   )Zis_floating_pointtor1   )argsrA   kwargsr@   r&   r'   _to   s
   
z FuyuBatchFeature.to.<locals>._toc                    r8   r&   r&   r9   rE   r&   r'   r*      r+   z'FuyuBatchFeature.to.<locals>.<listcomp>c                    r8   r&   r&   r9   rF   r&   r'   r*      r+   )r   r@   getlenr   r!   strr   intr-   r<   r"   appenddata)	r=   rC   rD   Znew_dataargkvnew_vr;   r&   )rE   rC   rA   rD   r@   r'   rB   {   s.   


zFuyuBatchFeature.tor    )r   r   )
__name__
__module____qualname____doc__r   r   rI   r   r?   rB   r&   r&   r&   r'   r/   I   s    +r/   c                !       s  e Zd ZdZg dZddejdddddddddfd	ed
ee	e
ef  dededede
dedeeee f deeee f dededee	e
ef  f fddZejddfdejd
e	e
ef dedeee
ef  deee
ef  dejfddZ				d4dejd
e	e
ef de
dedeee
ef  deee
ef  dejfdd Ze ddddddddddddejddfd	ee d
ee	e
ef  dee dee dee dee
 dee dee dee dee dee dee	e
ef  deee
ef  deee
ef  d!ee fd"d#Zd5d$ed%edee	e
ef  defd&d'Zd5dd(dee	e
ef  dd(fd)d*Z	d5d+d(d,d(d-d(d.d(d/ed0ed1edee	e
ef  defd2d3Z  ZS )6FuyuImageProcessora	  
    This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
    handle:

    - Processing Images:
        Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
        dimensions. The image output is always img_h, img_w of (1080, 1920)

        Then, it patches up these images using the patchify_image function.

    - Creating Image Input IDs:
        For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
        variable-sized images, each line of patches is terminated with a newline ID.

    - Image Patch Indices:
        For each image patch, the code maintains an index where these patches should be inserted in a token stream.


    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image to `size`.
        size (`Dict[str, int]`, *optional*, defaults to `{"height": 1080, "width": 1920}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
        do_pad (`bool`, *optional*, defaults to `True`):
            Whether to pad the image to `size`.
        padding_value (`float`, *optional*, defaults to 1.0):
            The value to pad the image with.
        padding_mode (`str`, *optional*, defaults to `"constant"`):
            The padding mode to use when padding the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float`, *optional*, defaults to 0.5):
            The mean to use when normalizing the image.
        image_std (`float`, *optional*, defaults to 0.5):
            The standard deviation to use when normalizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image.
        rescale_factor (`float`, *optional*, defaults to `1 / 255`):
            The factor to use when rescaling the image.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 30, "width": 30}`):
            Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
    r   image_input_idsimage_patchesimage_patch_indices_per_batch#image_patch_indices_per_subsequenceTN      ?constantg      ?gp?	do_resizesizeresampledo_padpadding_valuepadding_modedo_normalize
image_mean	image_std
do_rescalerescale_factor
patch_sizec                    s   t  jdi | || _|d ur|nddd| _|| _|| _|| _|| _|| _|| _	|	| _
|
| _|| _|d ur<|| _d S ddd| _d S )Ni8  i  )heightwidth   r&   )super__init__r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   )r=   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rD   	__class__r&   r'   rm      s   "zFuyuImageProcessor.__init__r%   data_formatinput_data_formatr   c                 K   s   t ||\}}|d |d }	}
||
kr||	kr|S |	| }|
| }t||}t|| }t|| }td|||f|||d|}|S )a  
        Resize an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.

        Returns:
            `np.ndarray`: The resized image.
        ri   rj   )r%   r^   r_   rp   rq   Nr&   )r   minrJ   r   )r=   r%   r^   r_   rp   rq   rD   image_heightimage_widthtarget_heighttarget_widthZheight_scale_factorZwidth_scale_factorZoptimal_scale_factorZ
new_heightZ	new_widthZscaled_imager&   r&   r'   r   
  s&   #
zFuyuImageProcessor.resizemodeconstant_valuesc                 C   s\   t ||\}}|d |d }	}
d}d}|	| }|
| }t|||f||ff||||d}|S )a  
        Pad an image to `(size["height"], size["width"])`.

        Args:
            image (`np.ndarray`):
                Image to pad.
            size (`Dict[str, int]`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            data_format (`ChannelDimension` or `str`, *optional*):
                The data format of the output image. If unset, the same format as the input image is used.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        ri   rj   r   )paddingrw   rx   rp   rq   )r   r
   )r=   r%   r^   rw   rx   rp   rq   rs   rt   ru   rv   padding_toppadding_leftpadding_bottompadding_rightZpadded_imager&   r&   r'   	pad_imageD  s   zFuyuImageProcessor.pad_imagereturn_tensorsc                    s  |dur|nj }durnj|dur|nj}|dur!|nj}|dur*|nj}dur3nj|dur<|nj}durEnjdurNnjdurWnj	dur`nj
|duri|nj}durrnj|dur{|nj}t|trtdd |D rtdt|}t|||||d
 dd |D }|rt|d d rtd	 du rt|d d fd
d|D }t|rއfdd|D }fdd|D }dd |D }dd |D }dd t||D }|rfdd|D }|rfdd|D }|r,fdd|D } dur; fdd|D }||||d}t||dS )a  

        Utility function to preprocess the images and extract necessary information about original formats.

        Args:
            images (`ImageInput`):
                Images to preprocess. Expects a single image, a list or images or a list of lists of images. Pixel
                values range from 0 to 255, or between 0 and 1 if `do_rescale` is `False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image to `size`.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
                Whether to pad the image to `size`.
            padding_value (`float`, *optional*, defaults to `self.padding_value`):
                The value to pad the image with.
            padding_mode (`str`, *optional*, defaults to `self.padding_mode`):
                The padding mode to use when padding the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float`, *optional*, defaults to `self.image_mean`):
                The mean to use when normalizing the image.
            image_std (`float`, *optional*, defaults to `self.image_std`):
                The standard deviation to use when normalizing the image.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                The factor to use when rescaling the image.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format of the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
        Nc                 s   s&    | ]}t |tot|d kV  qdS )   N)r!   r"   rH   r9   r&   r&   r'   r(     s   $ z0FuyuImageProcessor.preprocess.<locals>.<genexpr>z:Multiple images for a single sample are not yet supported.)
rf   rg   rc   rd   re   r`   Zsize_divisibilityr]   r^   r_   c                 S   s   g | ]	}d d |D qS )c                 S   r)   r&   )r   r#   r&   r&   r'   r*     r+   <FuyuImageProcessor.preprocess.<locals>.<listcomp>.<listcomp>r&   r$   r   r&   r&   r'   r*     s    z1FuyuImageProcessor.preprocess.<locals>.<listcomp>r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.c                       g | ]
}t |d   dqS r   )Zchannel_dimr   r   rq   r&   r'   r*         c                    "   g | ]} fd d|D qS )c                    s   g | ]
}j | d qS ))r^   rq   )r   r#   rq   r=   r^   r&   r'   r*     r   r   r&   r   r   r&   r'   r*         c                    r   r   r   r   r   r&   r'   r*     r   c                 S      g | ]}|d  gqS r   r&   r$   Z
image_sizer&   r&   r'   r*         c                 S   r   )   r&   r   r&   r&   r'   r*     r   c                 S   s"   g | ]\}}|d  |d   gqS r   r&   )r$   Zoriginal_sizeZresized_sizer&   r&   r'   r*     s    c                    s&   g | ]} fd d|D qS )c              	      s    g | ]}j | d qS ))r^   rw   rx   rq   )r~   r#   rq   rb   ra   r=   r^   r&   r'   r*     s    r   r&   r   r   r&   r'   r*     s    c                    r   )c                    s   g | ]
}j | d qS ))scalerq   )Zrescaler#   rq   rg   r=   r&   r'   r*     r   r   r&   r   r   r&   r'   r*      r   c                    s$   g | ]} fd d|D qS )c                    s   g | ]}j | d qS ))meanZstdrq   )	normalizer#   rd   re   rq   r=   r&   r'   r*     s    r   r&   r   r   r&   r'   r*     s    c                    s    g | ]} fd d|D qS )c                    s   g | ]}t | qS r&   )r   r#   rp   rq   r&   r'   r*     s    r   r&   r   r   r&   r'   r*     s    )r   image_unpadded_heightsimage_unpadded_widthsimage_scale_factors)rL   r0   )r]   r^   r_   r`   rf   rg   rc   rd   re   ra   rb   rh   r!   r"   anyr-   r.   r   r   loggerZwarning_oncer   r	   zipr/   )r=   r   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rp   rq   r   Zbatch_imagesZoriginal_image_sizesZimage_sizesr   r   r   rL   r&   )	rp   rd   re   rq   rb   ra   rg   r=   r^   r'   
preprocessj  s   F
zFuyuImageProcessor.preprocessrs   rt   c           	      C   s   |dur|n| j }| j d | j d }}|| dkr$td|d| || dkr4td|d| || }|| }|| }|S )a  
        Calculate number of patches required to encode an image.

        Args:
            image_height (`int`):
                Height of the image.
            image_width (`int`):
                Width of the image.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        Nri   rj   r   zimage_height=z must be divisible by zimage_width=)rh   r-   )	r=   rs   rt   rh   patch_heightpatch_widthZnum_patches_per_dim_hZnum_patches_per_dim_wnum_patchesr&   r&   r'   get_num_patches  s   z"FuyuImageProcessor.get_num_patchesztorch.Tensorc           
      C   s   t | dg |dur|n| j}|d |d }}|j\}}}}|d||}|d||}	|	 }	|	||d||}	|	dddd	d
}	|	|d|| | }	|	S )a|  
        Convert an image into a tensor of patches.

        Args:
            image (`torch.Tensor`):
                Image to convert. Shape: [batch, channels, height, width]
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Dictionary in the format `{"height": int, "width": int}` specifying the size of the patches.
        r@   Nri   rj   r   r   r      r   )r   rh   shapeZunfold
contiguousviewZpermutereshape)
r=   r%   rh   r   r   Z
batch_sizeZchannels_Zunfolded_along_heightpatchesr&   r&   r'   patchify_image5  s   
z!FuyuImageProcessor.patchify_imageimage_inputimage_presentimage_unpadded_himage_unpadded_wimage_placeholder_idimage_newline_idvariable_sizedc	           '   	   C   s  t | dg |dur|n| j}|d |d }	}
g }g }g }t|jd D ]}g }g }t|jd D ]}|||f r|||f }|jd |jd }}|rt|t|||f |	 |	 }t|t|||f |
 |
 }|ddd|d|f }||}}| j||d}tj	|g|tj
|jd	}| j|dd
d}||jd ksJ |r|d||
 }tj	|jd dg|tj
|jd	}tj||gdd}|d}||g || || q2|tjg tj
|jd	 q2|| || q%g }g }|D ][}d}g }g } |D ]F}!|!|k}"t|"}tj|tj|!jd	|!}#t|!d}$t|!d}%tj|"ddd }&|#| |$|&< |#|%|&< ||$ | |% ||7 }q	|| ||  qt|||||ddS )a  Process images for model input. In particular, variable-sized images are handled here.

        Args:
            image_input (`torch.Tensor` of shape [batch_size, subsequence_size, num_channels, height, width]):
                Tensor of images padded to model input size.
            image_present (`torch.Tensor` of shape [batch_size, subsequence_size, num_images]):
                Tensor of 1s and 0s indicating whether an image is present.
            image_unpadded_h (`torch.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image heights.
            image_unpadded_w (`torch.Tensor` of shape [batch_size, subsequence_size]):
                Tensor of unpadded image widths.
            image_placeholder_id (int):
                The id of the image placeholder token. Comes from an associated tokenizer.
            image_newline_id (int):
                The id of the image newline token. Comes from an associated tokenizer.
            variable_sized (bool):
                Whether to process images as variable-sized.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Size of the patches.
        r@   Nri   rj   r   r   r   )rs   rt   )ZdtyperA   )r%   r   )dimT)as_tuplerV   )rL   )r   rh   ranger   rr   mathceilr   r@   fullZint32rA   r   Z	unsqueezeZsqueezer   catrK   ZtensorZcount_nonzeroZarangeZint64Ztype_asZ	full_likeZnonzeror/   )'r=   r   r   r   r   r   r   r   rh   r   r   r   Zbatch_image_patchesZbatch_image_input_idsZbatch_indexrW   rX   Zsubseq_indexr%   rs   rt   Znew_hZnew_wr   Ztensor_of_image_idsr   Znewline_idsrY   rZ   Zsample_image_input_idsZindex_offsetZper_batch_indicesZper_subsequence_indicesZsubseq_image_input_idsZpatches_maskindicesZindices_in_stream_per_batchZ!indices_in_stream_per_subsequenceZpatches_indsr&   r&   r'   preprocess_with_tokenizer_infoO  s   







z1FuyuImageProcessor.preprocess_with_tokenizer_info)r\   r[   NNr    )rQ   rR   rS   rT   Zmodel_input_namesr   ZBILINEARboolr   r   rI   rJ   floatr   r   rm   npZndarrayr   r   r~   r   ZFIRSTr   r   r   r   r/   r   __classcell__r&   r&   rn   r'   rU      s"   -
	
"

>

&	
 (2$#	
rU   ),rT   r   typingr   r   r   r   numpyr   Zimage_processing_utilsr   r   r	   Zimage_transformsr
   r   r   Zimage_utilsr   r   r   r   r   r   r   r   r   r   utilsr   r   r   r   r   r   r   r@   Z
get_loggerrQ   r   r.   r/   rU   __all__r&   r&   r&   r'   <module>   s.   0$


m    
"