"""Image processor class for VitPose."""

import itertools
import math
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import to_channel_dimension_format
from ...image_utils import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
    ChannelDimension,
    ImageInput,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from ...utils import TensorType, is_scipy_available, is_torch_available, is_vision_available, logging


if is_torch_available():
    import torch

if is_vision_available():
    import PIL

if is_scipy_available():
    from scipy.linalg import inv
    from scipy.ndimage import affine_transform, gaussian_filter

if TYPE_CHECKING:
    from .modeling_vitpose import VitPoseEstimatorOutput

logger = logging.get_logger(__name__)


def box_to_center_and_scale(
    box: Union[Tuple, List, np.ndarray],
    image_width: int,
    image_height: int,
    normalize_factor: float = 200.0,
    padding_factor: float = 1.25,
):
    """
    Encodes a bounding box in COCO format into (center, scale).

    Args:
        box (`Tuple`, `List`, or `np.ndarray`):
            Bounding box in COCO format (top_left_x, top_left_y, width, height).
        image_width (`int`):
            Image width.
        image_height (`int`):
            Image height.
        normalize_factor (`float`):
            Width and height scale factor.
        padding_factor (`float`):
            Bounding box padding factor.

    Returns:
        tuple: A tuple containing center and scale.

        - `np.ndarray` [float32](2,): Center of the bbox (x, y).
        - `np.ndarray` [float32](2,): Scale of the bbox width & height.
    """
    top_left_x, top_left_y, width, height = box[:4]
    aspect_ratio = image_width / image_height
    center = np.array([top_left_x + width * 0.5, top_left_y + height * 0.5], dtype=np.float32)

    if width > aspect_ratio * height:
        height = width * 1.0 / aspect_ratio
    elif width < aspect_ratio * height:
        width = height * aspect_ratio

    scale = np.array([width / normalize_factor, height / normalize_factor], dtype=np.float32)
    scale = scale * padding_factor

    return center, scale
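

# Illustrative sketch of `box_to_center_and_scale` (toy numbers, values derived by hand from
# the formulas above): a 200x300 box at (50, 100) in a 640x480 frame is narrower than the
# frame's aspect ratio allows, so its width is first widened to 400 before normalization.
#
#   >>> center, scale = box_to_center_and_scale([50, 100, 200, 300], image_width=640, image_height=480)
#   >>> center    # (50 + 200 * 0.5, 100 + 300 * 0.5)
#   array([150., 250.], dtype=float32)
#   >>> scale     # ([400, 300] / 200) * 1.25
#   array([2.5  , 1.875], dtype=float32)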


def coco_to_pascal_voc(bboxes: np.ndarray) -> np.ndarray:
    """
    Converts bounding boxes from the COCO format to the Pascal VOC format.

    In other words, converts from (top_left_x, top_left_y, width, height) format
    to (top_left_x, top_left_y, bottom_right_x, bottom_right_y).

    Args:
        bboxes (`np.ndarray` of shape `(batch_size, 4)`):
            Bounding boxes in COCO format.

    Returns:
        `np.ndarray` of shape `(batch_size, 4)` in Pascal VOC format.
    """
    bboxes[:, 2] = bboxes[:, 2] + bboxes[:, 0] - 1
    bboxes[:, 3] = bboxes[:, 3] + bboxes[:, 1] - 1

    return bboxes
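

# Quick check of `coco_to_pascal_voc` (hand-derived): a COCO box (x=10, y=20, w=30, h=40)
# becomes (10, 20, 39, 59), since the bottom-right pixel is top-left + size - 1. Note that
# the conversion mutates the input array in place.
#
#   >>> coco_to_pascal_voc(np.array([[10.0, 20.0, 30.0, 40.0]]))
#   array([[10., 20., 39., 59.]])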
td| jdkrtd| j\}}}}| ||df}t|d||df}t|d||df}t	|d
tj}|ddddd	f | |ddddd	f< |dddddf | |dddddf< tt	|dd
k|d}||fS )a  Get keypoint predictions from score maps.

    Args:
        heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`):
            Model predicted heatmaps.

    Returns:
        tuple: A tuple containing aggregated results.

        - coords (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`):
            Predicted keypoint location.
        - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`):
            Scores (confidence) of the keypoints.
    zHeatmaps should be np.ndarrayr'   z Heatmaps should be 4-dimensionalr7   r   )r   r   r7   Nr   g        )
isinstancer+   ndarray
ValueErrorndimshapereshapeZargmaxZamaxZtileastyper-   where)	r9   
batch_sizenum_keypoints_r.   Zheatmaps_reshapedidxscorespredsr2   r2   r3   get_keypoint_predictionsx   s   
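

# Minimal sketch of `get_keypoint_predictions` on a toy heatmap: one image, one keypoint,
# with the peak placed at x=2, y=1 on a 3x4 grid.
#
#   >>> toy = np.zeros((1, 1, 3, 4), dtype=np.float32)
#   >>> toy[0, 0, 1, 2] = 1.0
#   >>> preds, scores = get_keypoint_predictions(toy)
#   >>> preds[0, 0]    # (x, y) of the argmax, recovered from the flattened index
#   array([2., 1.], dtype=float32)
#   >>> scores[0, 0]   # the heatmap value at the peak
#   array([1.], dtype=float32)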
,,rI   coordsbatch_heatmapskernelc                    s$  |j \}}}}| j d }|dks||kstdt|d d  t fdd|D }t|dd}t|}tj|d	d
d }| d d | d d |d   }	|	|d |d  t	d|| 
d| 7 }	|	t
dd}	||	 }
||	d  }||	| d  }||	| d  }||	| d  }||	d  }||	d |  }d||  }d||  }tj||gdd}|
||dd}|d|
  | }|d|
  | }d|| | |
 |
 | | |  }tj||||gdd}|
||dd}tj|ttjjtd  }| td|| 8 } | S )a  DARK post-pocessing. Implemented by unbiased_data_processing.

    Paper references:
    - Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
    - Zhang et al. Distribution-Aware Coordinate Representation for Human Pose Estimation (CVPR 2020).

    Args:
        coords (`np.ndarray` of shape `(num_persons, num_keypoints, 2)`):
            Initial coordinates of human pose.
        batch_heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`):
            Batched heatmaps as predicted by the model.
            A batch_size of 1 is used for the bottom up paradigm where all persons share the same heatmap.
            A batch_size of `num_persons` is used for the top down paradigm where each person has its own heatmaps.
        kernel (`int`, *optional*, defaults to 3):
            Gaussian kernel size (K) for modulation.

    Returns:
        `np.ndarray` of shape `(num_persons, num_keypoints, 2)`:
            Refined coordinates.
    """
    batch_size, num_keypoints, height, width = batch_heatmaps.shape
    num_coords = coords.shape[0]
    if not (batch_size == 1 or batch_size == num_coords):
        raise ValueError("The batch size of heatmaps should be 1 or equal to the batch size of coordinates.")
    radius = int((kernel - 1) // 2)
    # Modulate the heatmaps with a Gaussian, then move to log space
    batch_heatmaps = np.array(
        [
            [gaussian_filter(heatmap, sigma=0.8, radius=(radius, radius), axes=(0, 1)) for heatmap in heatmaps]
            for heatmaps in batch_heatmaps
        ]
    )
    batch_heatmaps = np.clip(batch_heatmaps, 0.001, 50)
    batch_heatmaps = np.log(batch_heatmaps)

    batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), (1, 1)), mode="edge").flatten()

    # Flattened index of each coordinate in the padded heatmaps
    index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (width + 2)
    index += (width + 2) * (height + 2) * np.arange(0, batch_size * num_keypoints).reshape(-1, num_keypoints)
    index = index.astype(int).reshape(-1, 1)
    i_ = batch_heatmaps_pad[index]
    ix1 = batch_heatmaps_pad[index + 1]
    iy1 = batch_heatmaps_pad[index + width + 2]
    ix1y1 = batch_heatmaps_pad[index + width + 3]
    ix1_y1_ = batch_heatmaps_pad[index - width - 3]
    ix1_ = batch_heatmaps_pad[index - 1]
    iy1_ = batch_heatmaps_pad[index - 2 - width]

    # First-order (gradient) and second-order (Hessian) finite differences of the log-likelihood
    dx = 0.5 * (ix1 - ix1_)
    dy = 0.5 * (iy1 - iy1_)
    derivative = np.concatenate([dx, dy], axis=1)
    derivative = derivative.reshape(num_coords, num_keypoints, 2, 1)

    dxx = ix1 - 2 * i_ + ix1_
    dyy = iy1 - 2 * i_ + iy1_
    dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
    hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)
    hessian = hessian.reshape(num_coords, num_keypoints, 2, 2)
    hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))

    # Newton step: shift each coordinate by -H^{-1} * gradient
    coords -= np.einsum("ijmn,ijnk->ijmk", hessian, derivative).squeeze()

    return coords
  }|d	 |d d
  }t| }| ddd	f | |d	  |d	 d  |ddd	f< | dddf | |d  |d d  |dddf< |S )ao  Get final keypoint predictions from heatmaps and apply scaling and
    translation to map them back to the image.

    Note:
        num_keypoints: K

    Args:
        coords (`np.ndarray` of shape `(num_keypoints, ndims)`):

            * If ndims=2, coords are predicted keypoint locations.
            * If ndims=4, coords are composed of (x, y, scores, tags).
            * If ndims=5, coords are composed of (x, y, scores, tags, flipped_tags).

        center (`np.ndarray` of shape `(2,)`):
            Center of the bounding box (x, y).
        scale (`np.ndarray` of shape `(2,)`):
            Scale of the bounding box with respect to the original image width and height.
        output_size (`np.ndarray` of shape `(2,)`):
            Size of the destination heatmaps in (height, width) format.

    Returns:
        np.ndarray: Predicted coordinates in the images.
    """
    if coords.shape[1] not in (2, 4, 5):
        raise ValueError("Coordinates need to have either 2, 4 or 5 dimensions.")
    if len(center) != 2:
        raise ValueError("Center needs to have 2 elements, one for x and one for y.")
    if len(scale) != 2:
        raise ValueError("Scale needs to consist of a width and height")
    if len(output_size) != 2:
        raise ValueError("Output size needs to consist of a height and width")

    # Recover the scale which is normalized by a factor of 200
    scale = scale * 200.0

    scale_y = scale[1] / (output_size[0] - 1.0)
    scale_x = scale[0] / (output_size[1] - 1.0)

    target_coords = np.ones_like(coords)
    target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5
    target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5

    return target_coords
size_inputsize_dstsize_targetc                 C   s  t | } t jdt jd}|d |d  }|d |d  }t| | |d< t|  | |d< |d|d  t|  d|d  t|   d|d    |d	< t| | |d
< t| | |d< |d|d  t|  d|d  t|   d|d    |d< |S )a  
    Calculate the transformation matrix under the constraint of unbiased. Paper ref: Huang et al. The Devil is in the
    Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Source: https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py

    Args:
        theta (`float`):
            Rotation angle in degrees.
        size_input (`np.ndarray`):
            Size of input image [width, height].
        size_dst (`np.ndarray`):
            Size of output image [width, height].
        size_target (`np.ndarray`):
            Size of ROI in input plane [w, h].

    Returns:
        `np.ndarray`: A matrix for transformation.
    """
    theta = np.deg2rad(theta)
    matrix = np.zeros((2, 3), dtype=np.float32)
    scale_x = size_dst[0] / size_target[0]
    scale_y = size_dst[1] / size_target[1]
    matrix[0, 0] = math.cos(theta) * scale_x
    matrix[0, 1] = -math.sin(theta) * scale_x
    matrix[0, 2] = scale_x * (
        -0.5 * size_input[0] * math.cos(theta) + 0.5 * size_input[1] * math.sin(theta) + 0.5 * size_target[0]
    )
    matrix[1, 0] = math.sin(theta) * scale_y
    matrix[1, 1] = math.cos(theta) * scale_y
    matrix[1, 2] = scale_y * (
        -0.5 * size_input[0] * math.sin(theta) - 0.5 * size_input[1] * math.cos(theta) + 0.5 * size_target[1]
    )
    return matrix
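

# Sanity check for `get_warp_matrix` (hand-derived, theta=0): with no rotation the matrix
# reduces to a pure scale, and the translation recentres the input onto the target ROI.
#
#   >>> M = get_warp_matrix(
#   ...     theta=0.0,
#   ...     size_input=np.array([100.0, 100.0]),
#   ...     size_dst=np.array([50.0, 50.0]),
#   ...     size_target=np.array([100.0, 100.0]),
#   ... )
#   >>> M[0, 0], M[1, 1]    # scale_x, scale_y -> (0.5, 0.5)
#   >>> M[0, 2], M[1, 2]    # translations -> 0.5 * (-50 + 0 + 50) = 0 on both axes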
66rt   c                    s   fddt jd D }t|g dg}t|  d  d  d  d  d	  d
 f\ d<  d<  d<  d<  d
<  d	<  fdd|D }tj|dd}|S )a[  
    This function implements the cv2.warpAffine function using scipy.ndimage.affine_transform. See
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.ndimage.affine_transform.html and
    https://docs.opencv.org/4.x/d4/d61/tutorial_warp_affine.html for more details.

    Note: the original implementation of cv2.warpAffine uses cv2.INTER_LINEAR.
    """
    channels = [src[..., i] for i in range(src.shape[-1])]

    # Convert to a 3x3 matrix understood by SciPy
    M_scipy = np.vstack([M, [0, 0, 1]])
    # SciPy's affine_transform expects the inverse ("pull") mapping with (row, col) axis order,
    # so invert the matrix and swap the x/y entries accordingly
    M_inv = inv(M_scipy)
    M_inv[0, 0], M_inv[0, 1], M_inv[1, 0], M_inv[1, 1], M_inv[0, 2], M_inv[1, 2] = (
        M_inv[1, 1],
        M_inv[1, 0],
        M_inv[0, 1],
        M_inv[0, 0],
        M_inv[1, 2],
        M_inv[0, 2],
    )

    new_src = [affine_transform(channel, M_inv, output_shape=size, order=1) for channel in channels]
    new_src = np.stack(new_src, axis=-1)

    return new_src
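

# Usage sketch chaining the two helpers above (random data, shapes only): a channels-last
# (H, W, C) image warped with a (2, 3) matrix to an output of `size=(out_h, out_w)`.
#
#   >>> image = np.random.rand(100, 100, 3)
#   >>> M = get_warp_matrix(0.0, np.array([100.0, 100.0]), np.array([49.0, 49.0]), np.array([100.0, 100.0]))
#   >>> scipy_warp_affine(image, M, size=(50, 50)).shape
#   (50, 50, 3)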


class VitPoseImageProcessor(BaseImageProcessor):
    r"""
    Constructs a VitPose image processor.

    Args:
        do_affine_transform (`bool`, *optional*, defaults to `True`):
            Whether to apply an affine transformation to the input images.
        size (`Dict[str, int]`, *optional*, defaults to `{"height": 256, "width": 192}`):
            Resolution of the image after `affine_transform` is applied. Only has an effect if `do_affine_transform` is set to `True`. Can
            be overridden by `size` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.).
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the input with mean and standard deviation.
        image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`, *optional*):
            The sequence of means for each channel, to be used when normalizing images.
        image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`, *optional*):
            The sequence of standard deviations for each channel, to be used when normalizing images.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_affine_transform: bool = True,
        size: Optional[Dict[str, int]] = None,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.do_affine_transform = do_affine_transform
        self.size = size if size is not None else {"height": 256, "width": 192}
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
        self.normalize_factor = 200.0

    def affine_transform(
        self,
        image: np.array,
        center: Tuple[float],
        scale: Tuple[float],
        rotation: float,
        size: Dict[str, int],
        data_format: Optional[ChannelDimension] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> np.array:
        """
        Apply an affine transformation to an image.

        Args:
            image (`np.array`):
                Image to transform.
            center (`Tuple[float]`):
                Center of the bounding box (x, y).
            scale (`Tuple[float]`):
                Scale of the bounding box with respect to height/width.
            rotation (`float`):
                Rotation angle in degrees.
            size (`Dict[str, int]`):
                Size of the destination image.
            data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format of the output image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image.
        """
        data_format = input_data_format if data_format is None else data_format

        size = (size["width"], size["height"])

        transformation = get_warp_matrix(rotation, center * 2.0, np.array(size) - 1.0, scale * 200.0)

        # the warp helper requires a channels-last image
        image = (
            image
            if input_data_format == ChannelDimension.LAST
            else to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format)
        )

        image = scipy_warp_affine(src=image, M=transformation, size=(size[1], size[0]))

        image = to_channel_dimension_format(image, data_format, ChannelDimension.LAST)

        return image

    def preprocess(
        self,
        images: ImageInput,
        boxes: Union[List[List[float]], np.ndarray],
        do_affine_transform: bool = None,
        size: Dict[str, int] = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ) -> PIL.Image.Image:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.

            boxes (`List[List[List[float]]]` or `np.ndarray`):
                List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding
                box coordinates in COCO format (top_left_x, top_left_y, width, height).

            do_affine_transform (`bool`, *optional*, defaults to `self.do_affine_transform`):
                Whether to apply an affine transformation to the input images.
            size (`Dict[str, int]` *optional*, defaults to `self.size`):
                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
                resizing.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image values between [0 - 1].
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use if `do_normalize` is set to `True`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
              width).
        """
        do_affine_transform = do_affine_transform if do_affine_transform is not None else self.do_affine_transform
        size = size if size is not None else self.size
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if isinstance(boxes, list) and len(images) != len(boxes):
            raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {len(boxes)}")
        elif isinstance(boxes, np.ndarray) and len(images) != boxes.shape[0]:
            raise ValueError(f"Batch of images and boxes mismatch : {len(images)} != {boxes.shape[0]}")

        # All transformations expect numpy arrays
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format
            input_data_format = infer_channel_dimension_format(images[0])

        # Crop one sub-image per bounding box and warp it to the model input size
        if self.do_affine_transform:
            new_images = []
            for image, image_boxes in zip(images, boxes):
                for box in image_boxes:
                    center, scale = box_to_center_and_scale(
                        box,
                        image_width=size["width"],
                        image_height=size["height"],
                        normalize_factor=self.normalize_factor,
                    )
                    transformed_image = self.affine_transform(
                        image, center, scale, rotation=0, size=size, input_data_format=input_data_format
                    )
                    new_images.append(transformed_image)
            images = new_images

        # The image processor creates pixel_values of shape (batch_size * num_persons, num_channels, height, width)
        all_images = []
        for image in images:
            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            all_images.append(image)

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
            for image in all_images
        ]

        data = {"pixel_values": images}
        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)

        return encoded_inputs
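
    # Usage sketch for `preprocess` (the image and box values below are illustrative):
    #
    #   >>> import numpy as np
    #   >>> processor = VitPoseImageProcessor()
    #   >>> image = np.zeros((480, 640, 3), dtype=np.uint8)      # stand-in for a real photo
    #   >>> boxes = [[[412.8, 157.6, 53.6, 138.8]]]              # one COCO-format box per image
    #   >>> inputs = processor(images=image, boxes=boxes, return_tensors="pt")
    #   >>> inputs["pixel_values"].shape                         # (num_persons, 3, 256, 192)
    #   torch.Size([1, 3, 256, 192])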

    def keypoints_from_heatmaps(
        self,
        heatmaps: np.ndarray,
        center: np.ndarray,
        scale: np.ndarray,
        kernel: int = 11,
    ):
        """
        Get final keypoint predictions from heatmaps and transform them back to
        the image.

        Args:
            heatmaps (`np.ndarray` of shape `(batch_size, num_keypoints, height, width)`):
                Model predicted heatmaps.
            center (`np.ndarray` of shape `(batch_size, 2)`):
                Center of the bounding box (x, y).
            scale (`np.ndarray` of shape `(batch_size, 2)`):
                Scale of the bounding box with respect to the original image width and height.
            kernel (int, *optional*, defaults to 11):
                Gaussian kernel size (K) for modulation, which should match the heatmap gaussian sigma when training.
                K=17 for sigma=3 and K=11 for sigma=2.

        Returns:
            tuple: A tuple containing keypoint predictions and scores.

            - preds (`np.ndarray` of shape `(batch_size, num_keypoints, 2)`):
                Predicted keypoint location in images.
            - scores (`np.ndarray` of shape `(batch_size, num_keypoints, 1)`):
                Scores (confidence) of the keypoints.
        """
        batch_size, _, height, width = heatmaps.shape

        coords, scores = get_keypoint_predictions(heatmaps)

        preds = post_dark_unbiased_data_processing(coords, heatmaps, kernel=kernel)

        # Transform back to the original image
        for i in range(batch_size):
            preds[i] = transform_preds(preds[i], center=center[i], scale=scale[i], output_size=[height, width])

        return preds, scores

    def post_process_pose_estimation(
        self,
        outputs: "VitPoseEstimatorOutput",
        boxes: Union[List[List[List[float]]], np.ndarray],
        kernel_size: int = 11,
        threshold: float = None,
        target_sizes: Union[TensorType, List[Tuple]] = None,
    ):
        """
        Transform the heatmaps into keypoint predictions and transform them back to the image.

        Args:
            outputs (`VitPoseEstimatorOutput`):
                VitPoseForPoseEstimation model outputs.
            boxes (`List[List[List[float]]]` or `np.ndarray`):
                List or array of bounding boxes for each image. Each box should be a list of 4 floats representing the bounding
                box coordinates in COCO format (top_left_x, top_left_y, width, height).
            kernel_size (`int`, *optional*, defaults to 11):
                Gaussian kernel size (K) for modulation.
            threshold (`float`, *optional*, defaults to None):
                Score threshold to keep object detection predictions.
            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
                `(height, width)` of each image in the batch. If unset, predictions will be resized with the default values.

        Returns:
            `List[List[Dict]]`: A list of dictionaries, each dictionary containing the keypoints and boxes for an image
            in the batch as predicted by the model.
        """
        # First compute centers and scales for each bounding box
        batch_size, num_keypoints, _, _ = outputs.heatmaps.shape

        if target_sizes is not None and len(target_sizes) != batch_size:
            raise ValueError("Make sure that you pass in as many target sizes as the batch dimension of the logits")

        centers = np.zeros((batch_size, 2), dtype=np.float32)
        scales = np.zeros((batch_size, 2), dtype=np.float32)
        flattened_boxes = list(itertools.chain(*boxes))
        for i in range(batch_size):
            if target_sizes is not None:
                image_width, image_height = target_sizes[i][0], target_sizes[i][1]
                scale_factor = np.array([image_width, image_height, image_width, image_height])
                flattened_boxes[i] = flattened_boxes[i] * scale_factor
            width, height = self.size["width"], self.size["height"]
            center, scale = box_to_center_and_scale(flattened_boxes[i], image_width=width, image_height=height)
            centers[i, :] = center
            scales[i, :] = scale

        preds, scores = self.keypoints_from_heatmaps(
            outputs.heatmaps.cpu().numpy(), centers, scales, kernel=kernel_size
        )

        all_boxes = np.zeros((batch_size, 4), dtype=np.float32)
        all_boxes[:, 0:2] = centers[:, 0:2]
        all_boxes[:, 2:4] = scales[:, 0:2]

        poses = torch.tensor(preds)
        scores = torch.tensor(scores)
        labels = torch.arange(0, num_keypoints)
        bboxes_xyxy = torch.tensor(coco_to_pascal_voc(all_boxes))

        results: List[List[Dict[str, torch.Tensor]]] = []

        pose_bbox_pairs = zip(poses, scores, bboxes_xyxy)

        for image_bboxes in boxes:
            image_results: List[Dict[str, torch.Tensor]] = []
            for _ in image_bboxes:
                # Unpack the next pose and bounding box from the iterator
                pose, score, bbox_xyxy = next(pose_bbox_pairs)
                score = score.squeeze()
                keypoints_labels = labels
                if threshold is not None:
                    keep = score > threshold
                    pose = pose[keep]
                    score = score[keep]
                    keypoints_labels = keypoints_labels[keep]
                pose_result = {"keypoints": pose, "scores": score, "labels": keypoints_labels, "bbox": bbox_xyxy}
                image_results.append(pose_result)
            results.append(image_results)

        return results
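

# End-to-end sketch (the checkpoint name and box values are illustrative): pair this image
# processor with `VitPoseForPoseEstimation` to go from an image plus per-person boxes to
# keypoint dictionaries.
#
#   >>> import torch
#   >>> from transformers import VitPoseForPoseEstimation, VitPoseImageProcessor
#   >>> processor = VitPoseImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
#   >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")
#   >>> # `image` is a PIL.Image or numpy array containing the person(s)
#   >>> boxes = [[[412.8, 157.6, 53.6, 138.8]]]
#   >>> inputs = processor(image, boxes=boxes, return_tensors="pt")
#   >>> with torch.no_grad():
#   ...     outputs = model(**inputs)
#   >>> results = processor.post_process_pose_estimation(outputs, boxes=boxes)
#   >>> results[0][0]["keypoints"].shape    # (num_keypoints, 2), in original image pixels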


__all__ = ["VitPoseImageProcessor"]