"""PyTorch VitPose model."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import load_backbone
from .configuration_vitpose import VitPoseConfig


logger = logging.get_logger(__name__)


@dataclass
class VitPoseEstimatorOutput(ModelOutput):
    """
    Class for outputs of pose estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
        heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
            Heatmaps as predicted by the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
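
    Note (illustrative): for `usyd-community/vitpose-base-simple` (17 COCO keypoints, 256 x 192 input resolution),
    `heatmaps` has shape `(batch_size, 17, 64, 48)`; the exact shape depends on the checkpoint configuration.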
    Nlossheatmaps.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchZFloatTensor__annotations__r   r   r   r    r   r   [/var/www/auris/lib/python3.10/site-packages/transformers/models/vitpose/modeling_vitpose.pyr   '   s   
 r   c                   @   s>   e Zd ZeZdZdZdZdee	j
e	je	jf ddfddZdS )	VitPosePreTrainedModelZvitpixel_valuesTmodulereturnNc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS dS )zInitialize the weightsg        )meanZstdNg      ?)
isinstancer   LinearConv2dinitZtrunc_normal_weightdatator   Zfloat32configZinitializer_rangeZdtypebiasZzero_	LayerNormZfill_)selfr   r   r   r   _init_weightsJ   s   

z$VitPosePreTrainedModel._init_weights)r   r   r   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingr   r   r"   r#   r*   r,   r   r   r   r   r   C   s    &r   gaussian-heatmapc                 C   s   |dvrt d| jdkrt d| j\}}}}d}|dkr7d}| dddddd	f  | dddddd	f< | |d
|||} |  }| D ]$\}	}
| dd|
d	f |dd|	d	f< | dd|	d	f |dd|
d	f< qH|||||f}|d
}|S )a  Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps that flipped back to the original image
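
    Example (illustrative sketch; the single flip pair below is arbitrary, not a real dataset's flip pairs):

    ```python
    >>> import torch
    >>> from transformers.models.vitpose.modeling_vitpose import flip_back

    >>> # heatmaps predicted on horizontally flipped images: 2 keypoints forming one mirrored pair
    >>> flipped_heatmaps = torch.rand(1, 2, 64, 48)
    >>> flip_pairs = torch.tensor([[0, 1]])
    >>> restored = flip_back(flipped_heatmaps, flip_pairs)
    >>> restored.shape
    torch.Size([1, 2, 64, 48])
    ```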
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        # the offset channels of the combined target change sign under a horizontal flip
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)

    output_flipped_back = output_flipped.clone()
    # swap the heatmaps of each mirrored keypoint pair (e.g. left ear <-> right ear)
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # flip the heatmaps back along the horizontal (width) axis
    output_flipped_back = output_flipped_back.flip(-1)

    return output_flipped_back


class VitPoseSimpleDecoder(nn.Module):
    """
    Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
    feature maps into heatmaps.
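
    For example, with the default 4x upsampling a `(batch_size, hidden_size, 16, 12)` feature map is turned into
    `(batch_size, num_labels, 64, 48)` heatmaps (shapes are illustrative and depend on the configuration).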
    """

    def __init__(self, config) -> None:
        super().__init__()

        self.activation = nn.ReLU()
        self.upsampling = nn.Upsample(scale_factor=config.scale_factor, mode="bilinear", align_corners=False)
        self.conv = nn.Conv2d(
            config.backbone_config.hidden_size, config.num_labels, kernel_size=3, stride=1, padding=1
        )

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None) -> torch.Tensor:
        hidden_state = self.activation(hidden_state)
        hidden_state = self.upsampling(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


class VitPoseClassicDecoder(nn.Module):
    """
    Classic decoding head consisting of two deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
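
    The two stride-2 deconvolutions upsample the feature maps by a factor of 4 (for example from
    `(batch_size, hidden_size, 16, 12)` to `(batch_size, 256, 64, 48)`), after which the 1x1 convolution maps the
    256 channels to `num_labels` keypoint heatmaps (shapes are illustrative and depend on the configuration).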
    """

    def __init__(self, config: VitPoseConfig):
        super().__init__()

        self.deconv1 = nn.ConvTranspose2d(
            config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.batchnorm1 = nn.BatchNorm2d(256)
        self.relu1 = nn.ReLU()

        self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()

        self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None):
        hidden_state = self.deconv1(hidden_state)
        hidden_state = self.batchnorm1(hidden_state)
        hidden_state = self.relu1(hidden_state)

        hidden_state = self.deconv2(hidden_state)
        hidden_state = self.batchnorm2(hidden_state)
        hidden_state = self.relu2(hidden_state)

        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


@auto_docstring(
    custom_intro="""
    The VitPose model with a pose estimation head on top.
    )Zcustom_introc                       s   e Zd Zdeddf fddZe						ddejdeej deej d	eej d
ee	 dee	 dee	 de
eef fddZ  ZS )VitPoseForPoseEstimationr(   r   Nc                    s|   t  | t|| _t| jjdstdt| jjds!tdt| jjds,td|jr3t|nt	|| _
|   d S )NrL   z0The backbone should have a hidden_size attribute
image_sizez0The backbone should have an image_size attribute
patch_sizez/The backbone should have a patch_size attribute)rF   rG   r   backbonehasattrr(   r1   Zuse_simple_decoderr?   rX   headZ	post_initrO   rP   r   r   rG      s   
z!VitPoseForPoseEstimation.__init__r   dataset_indexr8   labelsoutput_attentionsoutput_hidden_statesreturn_dictc                 C   sF  |dur|n| j j}|dur|n| j j}|dur|n| j j}d}|dur(td| jj|||||d}	|r:|	jd n|	d d }
|
jd }| j j	j
d | j j	jd  }| j j	j
d | j j	jd  }|
ddd|d|| }
| j|
|d}|s|r|f|	dd  }n	|f|	dd  }|dur|f| S |S t|||	j|	jd	S )
ac  
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training. When training on a single dataset, index 0 refers to that dataset; when training on multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers to dataset B (e.g. CrowdPose).
        flip_pairs (`torch.Tensor`, *optional*):
            Pairs of keypoint indices that are mirrored (for example, left ear -- right ear). When provided, the
            predicted heatmaps are flipped back to the original (unflipped) image via `flip_back`.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
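
        >>> # optionally turn the heatmaps into keypoint coordinates in the original image, using the image
        >>> # processor's post-processing helper (illustrative; see `VitPoseImageProcessor.post_process_pose_estimation`)
        >>> pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes)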
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values,
            dataset_index=dataset_index,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        # take the last feature map and turn the sequence of patch embeddings back into a
        # (batch_size, hidden_size, patch_height, patch_width) feature map
        sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        if not return_dict:
            if output_hidden_states:
                output = (heatmaps,) + outputs[1:]
            else:
                output = (heatmaps,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"]