
"""PyTorch VitPose model."""

from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import load_backbone
from .configuration_vitpose import VitPoseConfig


logger = logging.get_logger(__name__)


@dataclass
class VitPoseEstimatorOutput(ModelOutput):
    """
    Class for outputs of pose estimation models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss is not supported at this moment. See https://github.com/ViTAE-Transformer/ViTPose/tree/main/mmpose/models/losses for further detail.
        heatmaps (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`):
            Heatmaps as predicted by the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    loss: Optional[torch.FloatTensor] = None
    heatmaps: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
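
# Shape note (illustrative only, assuming the default 256x192 input resolution, a 16x16 patch size and the
# 17 COCO keypoints used by e.g. "usyd-community/vitpose-base-simple"):
#
#     pixel_values:    (batch_size, 3, 256, 192)
#     backbone tokens: (batch_size, (256 // 16) * (192 // 16), hidden_size) = (batch_size, 192, hidden_size)
#     heatmaps:        (batch_size, 17, 64, 48)  # one heatmap per keypoint, at 1/4 of the input resolution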


class VitPosePreTrainedModel(PreTrainedModel):
    config_class = VitPoseConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 before the truncated-normal init, then cast back to the original dtype.
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


def flip_back(output_flipped, flip_pairs, target_type="gaussian-heatmap"):
    """
    Flip the flipped heatmaps back to the original form.

    Args:
        output_flipped (`torch.tensor` of shape `(batch_size, num_keypoints, height, width)`):
            The output heatmaps obtained from the flipped images.
        flip_pairs (`torch.Tensor` of shape `(num_keypoints, 2)`):
            Pairs of keypoints which are mirrored (for example, left ear -- right ear).
        target_type (`str`, *optional*, defaults to `"gaussian-heatmap"`):
            Target type to use. Can be gaussian-heatmap or combined-target.
            gaussian-heatmap: Classification target with gaussian distribution.
            combined-target: The combination of classification target (response map) and regression target (offset map).
            Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Returns:
        torch.Tensor: heatmaps flipped back to the original image.
    """
    if target_type not in ["gaussian-heatmap", "combined-target"]:
        raise ValueError("target_type should be gaussian-heatmap or combined-target")

    if output_flipped.ndim != 4:
        raise ValueError("output_flipped should be [batch_size, num_keypoints, height, width]")

    batch_size, num_keypoints, height, width = output_flipped.shape
    channels = 1
    if target_type == "combined-target":
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    output_flipped = output_flipped.reshape(batch_size, -1, channels, height, width)
    output_flipped_back = output_flipped.clone()

    # Swap left-right keypoint pairs.
    for left, right in flip_pairs.tolist():
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]
    output_flipped_back = output_flipped_back.reshape((batch_size, num_keypoints, height, width))
    # Flip the heatmaps back along the horizontal (width) axis.
    output_flipped_back = output_flipped_back.flip(-1)
    return output_flipped_back
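
# Illustrative usage of `flip_back` (a sketch, not part of the modeling code): assuming a COCO-style
# skeleton where, for example, keypoint indices 1/2, 3/4 and 5/6 form left/right pairs, heatmaps predicted
# for a horizontally flipped image can be mapped back to the original layout like this:
#
#     heatmaps = torch.randn(2, 17, 64, 48)
#     flip_pairs = torch.tensor([[1, 2], [3, 4], [5, 6]])
#     restored = flip_back(heatmaps, flip_pairs)  # swaps paired channels, then mirrors the width axis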
S\R                  S\\R                     S\R                  4S jjr	Sr
U =r$ )VitPoseSimpleDecoder   z
Simple decoding head consisting of a ReLU activation, 4x upsampling and a 3x3 convolution, turning the
feature maps into heatmaps.
r)   c                   > [         TU ]  5         [        R                  " 5       U l        [        R
                  " UR                  SSS9U l        [        R                  " UR                  R                  UR                  SSSS9U l        g )NbilinearF)scale_factormodealign_cornersr   r   kernel_sizestridepadding)super__init__r   ReLU
activationUpsamplera   
upsamplingr/   backbone_confighidden_size
num_labelsconvr=   r6   	__class__s     r"   ri   VitPoseSimpleDecoder.__init__   se    '')++63F3FZglmII""..0A0AqYZde
	r!   hidden_staterQ   c                     U R                  U5      nU R                  U5      nU R                  U5      nUb  [        X25      nU$ N)rk   rm   rq   r[   r=   ru   rQ   r   s       r"   forwardVitPoseSimpleDecoder.forward   sA    |4|499\*! 6Hr!   )rk   rq   rm   )r)   Nrw   )r   r   r   r   r   ri   r   Tensorr   ry   r    __classcell__rs   s   @r"   r]   r]      s@    

	ELL 	hu||>T 	`e`l`l 	 	r!   r]   c                   x   ^  \ rS rSrSrS\4U 4S jjrS	S\R                  S\	\R                     4S jjr
SrU =r$ )
VitPoseClassicDecoder   z
    Classic decoding head consisting of a 2 deconvolutional blocks, followed by a 1x1 convolution layer,
    turning the feature maps into heatmaps.
    """

    def __init__(self, config: VitPoseConfig):
        super().__init__()

        self.deconv1 = nn.ConvTranspose2d(
            config.backbone_config.hidden_size, 256, kernel_size=4, stride=2, padding=1, bias=False
        )
        self.batchnorm1 = nn.BatchNorm2d(256)
        self.relu1 = nn.ReLU()
        self.deconv2 = nn.ConvTranspose2d(256, 256, kernel_size=4, stride=2, padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.relu2 = nn.ReLU()
        self.conv = nn.Conv2d(256, config.num_labels, kernel_size=1, stride=1, padding=0)

    def forward(self, hidden_state: torch.Tensor, flip_pairs: Optional[torch.Tensor] = None):
        # Two (deconvolution + batch norm + ReLU) blocks, followed by a 1x1 convolution.
        hidden_state = self.deconv1(hidden_state)
        hidden_state = self.batchnorm1(hidden_state)
        hidden_state = self.relu1(hidden_state)
        hidden_state = self.deconv2(hidden_state)
        hidden_state = self.batchnorm2(hidden_state)
        hidden_state = self.relu2(hidden_state)
        heatmaps = self.conv(hidden_state)

        if flip_pairs is not None:
            heatmaps = flip_back(heatmaps, flip_pairs)

        return heatmaps


@auto_docstring(
    custom_intro="""
    The VitPose model with a pose estimation head on top.
    """
)
class VitPoseForPoseEstimation(VitPosePreTrainedModel):
    def __init__(self, config: VitPoseConfig) -> None:
        super().__init__(config)

        self.backbone = load_backbone(config)

        # The backbone must expose these attributes, as they are needed to reshape features into heatmaps.
        if not hasattr(self.backbone.config, "hidden_size"):
            raise ValueError("The backbone should have a hidden_size attribute")
        if not hasattr(self.backbone.config, "image_size"):
            raise ValueError("The backbone should have an image_size attribute")
        if not hasattr(self.backbone.config, "patch_size"):
            raise ValueError("The backbone should have a patch_size attribute")

        self.head = VitPoseSimpleDecoder(config) if config.use_simple_decoder else VitPoseClassicDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        dataset_index: Optional[torch.Tensor] = None,
        flip_pairs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, VitPoseEstimatorOutput]:
        r"""
        dataset_index (`torch.Tensor` of shape `(batch_size,)`):
            Index to use in the Mixture-of-Experts (MoE) blocks of the backbone.

            This corresponds to the dataset index used during training, e.g. for a single dataset, index 0 refers to
            that dataset; for multiple datasets, index 0 refers to dataset A (e.g. MPII) and index 1 refers to
            dataset B (e.g. CrowdPose).
        flip_pairs (`torch.tensor`, *optional*):
            Whether to mirror pairs of keypoints (for example, left ear -- right ear).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, VitPoseForPoseEstimation
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoImageProcessor.from_pretrained("usyd-community/vitpose-base-simple")
        >>> model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> boxes = [[[412.8, 157.61, 53.05, 138.01], [384.43, 172.21, 15.12, 35.74]]]
        >>> inputs = processor(image, boxes=boxes, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> heatmaps = outputs.heatmaps
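        >>> # Optional post-processing sketch (assumes the image processor exposes
        >>> # `post_process_pose_estimation`, which converts heatmaps into keypoint coordinates
        >>> # in the original image):
        >>> pose_results = processor.post_process_pose_estimation(outputs, boxes=boxes)
        >>> keypoints = pose_results[0][0]["keypoints"]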
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported")

        outputs = self.backbone.forward_with_filtered_kwargs(
            pixel_values,
            dataset_index=dataset_index,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        # Turn the last hidden state back into a (batch_size, hidden_size, patch_height, patch_width) feature map.
        sequence_output = outputs.feature_maps[-1] if return_dict else outputs[0][-1]
        batch_size = sequence_output.shape[0]
        patch_height = self.config.backbone_config.image_size[0] // self.config.backbone_config.patch_size[0]
        patch_width = self.config.backbone_config.image_size[1] // self.config.backbone_config.patch_size[1]
        sequence_output = (
            sequence_output.permute(0, 2, 1).reshape(batch_size, -1, patch_height, patch_width).contiguous()
        )

        heatmaps = self.head(sequence_output, flip_pairs=flip_pairs)

        if not return_dict:
            if output_hidden_states:
                output = (heatmaps,) + outputs[1:]
            else:
                output = (heatmaps,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return VitPoseEstimatorOutput(
            loss=loss,
            heatmaps=heatmaps,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["VitPosePreTrainedModel", "VitPoseForPoseEstimation"]
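
# Decoder selection sketch (illustrative; `use_simple_decoder` is a `VitPoseConfig` option):
#
#     from transformers import VitPoseConfig, VitPoseForPoseEstimation
#
#     config = VitPoseConfig(use_simple_decoder=False)
#     model = VitPoseForPoseEstimation(config)  # model.head is a VitPoseClassicDecoder
#
# With the default `use_simple_decoder=True`, `model.head` is a `VitPoseSimpleDecoder` instead.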