
    fTh                        S r SSKrSSKJr  SSKJrJrJrJ	r	  SSK
rSSKrSSKrSSKJr  SSKJr  SSKJrJr  SS	KJrJr  SS
KJr  SSKJrJrJrJr  SSKJrJ r J!r!  \RD                  " \#5      r$S\RJ                  S\RJ                  4S jr&S\RJ                  S\RJ                  4S jr'S\RJ                  S\(4S jr)SCS\RJ                  S\*S\+S\(S\RJ                  4
S jjr,SDS jr-S r. " S S\R^                  5      r0 " S S\R^                  5      r1 " S S\R^                  5      r2\ " S  S!\5      5       r3 " S" S#\R^                  5      r4 " S$ S%\R^                  5      r5 " S& S'\R^                  5      r6 " S( S)\R^                  5      r7 " S* S+\R^                  5      r8 " S, S-\85      r9 " S. S/\R^                  5      r: " S0 S1\R^                  5      r;\ " S2 S3\5      5       r< " S4 S5\R^                  5      r= " S6 S7\R^                  5      r> " S8 S9\R^                  5      r? " S: S;\<5      r@ " S< S=\R^                  5      rA " S> S?\<5      rB\ " S@ SA\<5      5       rC/ SBQrDg)EzPyTorch GroupViT model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_lossr#   '   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)r#   t)r%   caption_loss
image_losss      r"   groupvit_lossr*   ,   s*    #J/L!*,,.1J%,,r$   dimc                     U R                  U5      nUR                  USS9S   n[        R                  " U [        R                  S9R                  XS5      nXBR                  5       -
  U-   nU$ )NTkeepdimr   memory_format      ?)softmaxmaxr   
zeros_likelegacy_contiguous_formatscatter_detach)r   r+   y_softindexy_hardrets         r"   hard_softmaxr<   2   sf    ^^C FJJsDJ)!,EfE4R4RS\\]`ilmF
==?
"V
+CJr$   tauhardc           	      ,   [         R                  R                  R                  [         R                  " SU R
                  U R                  S9[         R                  " SU R
                  U R                  S95      nUR                  U R                  5      nX-   U-  nUR                  U5      nU(       a]  UR                  USS9S   n[         R                  " U [         R                  S9R                  X7S5      nXR                  5       -
  U-   n	U	$ Un	U	$ )N        )r   dtyper1   Tr-   r   r/   )r   distributionsgumbelGumbeltensorr   rA   sampleshaper2   r3   r4   r5   r6   r7   )
r   r=   r>   r+   gumbel_distgumbelsr8   r9   r:   r;   s
             r"   gumbel_softmaxrJ   <   s    %%,,33SfllCSfllCK   .G3&G__S!F

3
-a0!!&8V8VW``admpq}}&/ J Jr$   c                    X-  U R                   S   -  S-  nX:  a4  [        [        R                  " X$-  5      5      nU R                   S   U-  nO3[        [        R                  " X-  5      5      nU R                   S   U-  nU R                   S   nU R                   S   nU R	                  XxXe5      n [
        R                  R                  XU4SUS9n U $ )a  
Args:
    attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
    height (`int`): height of the output attention map
    width (`int`): width of the output attention map
    align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

Returns:
    `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
         ?r   r   bilinearsizemodealign_corners)rG   intnproundreshaper   r   interpolate)	
attentionsheightwidthrR   scale
feat_widthfeat_height
batch_sizegroupss	            r"   resize_attention_mapr`   R   s     ^z//22s:E~%-01
 &&q)Z7"((6>23%%a(K7
!!!$Ja F##JPJ**%z + J r$   c           	      V   / n[         R                  " 5          SnU  Hj  nUR                  SSS5      R                  5       nUc  UnOX4-  n[	        UR                  SSS5      R                  5       /UQ76 nUR                  U5        Ml     SSS5        US   nU$ ! , (       d  f       N= f)a  
Args:
    attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
    hw_shape (`tuple(int)`): height and width of the output attention map
Returns:
    `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
Nr   rL   r   )r   no_gradpermute
contiguousr`   append)rX   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupings          r"   get_grouping_from_attentionsrm   p   s     I	$J#++Aq!4??AJ&","1">/0G0G1a0P0[0[0]i`hiL\* % 
 r]N! 
s   A3B
B(c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )GroupViTCrossAttentionLayer   configc                   > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        g Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfrq   	__class__s     r"   rw   $GroupViTCrossAttentionLayer.__init__   sb    %f-	\\&"4"4&:O:OP
v&f&8&8f>S>STr$   c                     UnX0R                  XS9S   -   nX0R                  U R                  U5      5      -   nU R                  U5      nU$ )N)encoder_hidden_statesr   ry   r   r}   r   )r   querykeyxs       r"   forward#GroupViTCrossAttentionLayer.forward   sK    		%	;A>>A''NN1r$   r   )	__name__
__module____qualname____firstlineno__r   rw   r   __static_attributes____classcell__r   s   @r"   ro   ro      s    U3 U r$   ro   c                   @   ^  \ rS rSrS\4U 4S jjrSS jrS rSrU =r	$ )GroupViTAssignAttention   rq   c                   > [         TU ]  5         UR                  S-  U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l        [        R
                  " UR                  UR                  5      U l	        UR                  U l
        g )N      )rv   rw   r{   r[   r   Linearq_projk_projv_projproj
assign_epsr   s     r"   rw    GroupViTAssignAttention.__init__   s    ''-
ii 2 2F4F4FGii 2 2F4F4FGii 2 2F4F4FGIIf00&2D2DE	 ++r$   c                     U(       a  U R                   (       a  [        USUS9nU$ U(       a  [        USS9nU$ [        R                  R                  USS9nU$ )N)r+   r>   r+   )trainingrJ   r<   r   r   r2   )r   ry   rC   r>   s       r"   get_attn GroupViTAssignAttention.get_attn   sX    dmm!$BT:D  #Db1  }},,Tr,:r$   c                 `   UnU R                  U5      nU R                  U5      nU R                  U5      nXR                  SS5      -  U R                  -  nU R                  U5      nU R                  USSS9nXUR                  SSS9U R                  -   -  nXS-  nU R                  U5      nXv4$ )Nr   rb   F)rC   r>   Tr+   r.   )	r   r   r   	transposer[   r   sumr   r   )r   r   r   valueraw_attnry   	soft_attnouts           r"   r   GroupViTAssignAttention.forward   s    E" kk# E" MM"b11TZZ?}}X&MM(5uME	xxBx5GHliin~r$   )r   r   r   r   r[   r   )TT)
r   r   r   r   r   rw   r   r   r   r   r   s   @r"   r   r      s    ,3 ,	 r$   r   c                   <   ^  \ rS rSrS\4U 4S jjrS rS rSrU =r	$ )GroupViTTokenAssign   rq   c                 V  > [         TU ]  5         X0l        [        R                  " UR
                  UR                  S9U l        [        UR                  [        R                  R                  5      (       a  UR                  OUR                  UR                  4nU Vs/ s H  n[        XQR
                  -  5      PM     snu  pg[        XXc5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR
                  UR                  S9U l        [%        U5      U l        [)        U5      U l        [        R                  " UR
                  UR                  S9U l        [/        XR
                  XqR
                  5      U l        g s  snf rs   )rv   rw   num_output_groupr   rz   r{   r|   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterablerS   GroupViTMixerMLP	mlp_internorm_post_tokensnorm_xro   pre_assign_attnr   assign
norm_new_xr~   mlp_channels)	r   rq   num_group_tokenr   r   r   
tokens_dimchannels_dimr   s	           r"   rw   GroupViTTokenAssign.__init__   sD    0<<(:(:@U@UV &11;??3K3KLL ##))6+B+BC 	
 JZ#ZIYAC,>,>(>$?IY#Z 
)&:` "V-?-?VEZEZ [ll6#5#56;P;PQ:6B-f5,,v'9'9v?T?TU'0B0BLRdRde $[s   !F&c                 J    U R                  U5      nU R                  U5      nU$ )z
Args:
    group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

Returns:
    projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
)r   r   )r   group_tokensprojected_group_tokenss      r"   project_group_token'GroupViTTokenAssign.project_group_token   s+     "&!=!%!6!67M!N%%r$   c                    U R                  U5      nU R                  U5      nU R                  U5      nU R                  X15      nU R	                  X15      u  pEXC-  nX@R                  U R                  U5      5      -   nXE4$ )z
Args:
    image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
    group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
)r   r   r   r   r   r   r   )r   image_tokensr   r   new_image_tokens	attentions         r"   r   GroupViTTokenAssign.forward   s     ''5{{<0!%!9!9,!G!%!5!56L![&*kk2H&W#2+.?.?P`@a.bb**r$   )	r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   rw   r   r   r   r   r   s   @r"   r   r      s!    f3 f*&+ +r$   r   c                   :   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S	'   Sr\\	S
'   Sr\\	S'   S\\   4S jrSrg)GroupViTModelOutputi  a  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
        Classification scores for each pixel.

        <Tip warning={true}>

        The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
        to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
        original image size as post-processing. You should always check your logits shape and resize as needed.

        </Tip>

    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTVisionModel`].
Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r   r   N)getattrto_tuple).0kr   s     r"   	<genexpr>/GroupViTModelOutput.to_tuple.<locals>.<genexpr>1  s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysr   s   `r"   r   GroupViTModelOutput.to_tuple0  s#     
YY[
 
 	
r$    )r   r   r   r   __doc__r   r   r   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r$   r"   r   r     s    B )-D(5$$
%,48hu001837OXe//077;%"3"34;/3K%++,304L(5,,-448186:3:
%* 
r$   r   c            	          ^  \ rS rSrSr    SS\S\\\\\4   4   S\S\4U 4S jjjrSS\	R                  S	\S
\	R                  4S jjrSrU =r$ )GroupViTPatchEmbeddingsi7  z
Image to Patch Embedding.

image_size
patch_sizenum_channels	embed_dimc                 `  > [         TU ]  5         [        U[        R                  R
                  5      (       a  UOX4n[        U[        R                  R
                  5      (       a  UOX"4nUS   US   -  US   US   -  -  nXl        X l        XPl        [        R                  " X4X"S9U l        g )Nr   r   )kernel_sizestride)rv   rw   r   r   r   r   r   r   num_patchesr   Conv2d
projection)r   r   r   r   r   r   r   s         r"   rw    GroupViTPatchEmbeddings.__init__<  s     	#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$&))Lgr$   pixel_valuesinterpolate_pos_encodingr   c                 >   UR                   u  p4pVU(       dV  XPR                  S   :w  d  X`R                  S   :w  a2  [        SU SU SU R                  S    SU R                  S    S3	5      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   r   zInput image size (*z) doesn't match model ().rL   )rG   r   
ValueErrorr   flattenr   )r   r   r   r^   r   rY   rZ   r   s           r"   r   GroupViTPatchEmbeddings.forwardM  s    2>2D2D/
&'++u8J/J (% 9+,Adooa.@-AE  OOL)11!4>>q!Dr$   )r   r   r   r   )      r	   i   F)r   r   r   r   r   rS   r   r   rw   r   Tensorboolr   r   r   r   s   @r"   r   r   7  s     24hh #uS#X./h 	h
 h h"	ELL 	D 	]b]i]i 	 	r$   r   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\
S\R                  4S jjrSrU =r$ )GroupViTVisionEmbeddingsiY  rq   c                   > [         TU ]  5         [        UR                  UR                  UR
                  UR                  S9U l        U R                  R                  n[        R                  " [        R                  " SX!R                  5      5      U l        [        R                  " UR                  5      U l        [        R                   " UR                  UR"                  S9U l        UR                  U l        Xl        g )N)r   r   r   r   r   rt   )rv   rw   r   r   r   r   r{   patch_embeddingsr   r   	Parameterr   zerosposition_embeddingsDropoutdropoutrz   r|   	layernormrq   )r   rq   r   r   s      r"   rw   !GroupViTVisionEmbeddings.__init__Z  s     7((((,,((	!
 ++77#%<<A{L^L^0_#` zz&..1f&8&8f>S>ST ++r$   
embeddingsrY   rZ   r   c                 ,   UR                   S   nU R                  R                   S   n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  nUR                   S   nX R
                  -  nX0R
                  -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and no class embeddings.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   rb   rM   r   r	   rL   bicubicFrO   )rG   r
  r   jit
is_tracingr   r   rV   rd   r   r   rW   view)r   r  rY   rZ   r   num_positionspatch_pos_embedr+   
new_height	new_widthsqrt_num_positionss              r"   r   1GroupViTVisionEmbeddings.interpolate_pos_encodingj  s    !&&q)0066q9 yy##%%+*F6?+++22r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nr$   r   r   c                     UR                   u  p4pVU R                  XS9nU R                  U5      nUR                  5       u  p8n	U(       a  XpR	                  XuU5      -   nOXpR
                  -   nU R                  U5      nU$ )N)r   )rG   r  r  rP   r   r
  r  )
r   r   r   r^   r   rY   rZ   r  seq_len_s
             r"   r    GroupViTVisionEmbeddings.forward  s    2>2D2D/
&**<*k
^^J/
!+!2
Q $#&C&CJX]&^^J#&>&>>J\\*-
r$   )rq   r  r  r  r   r
  r  )r   r   r   r   r   rw   r   r  rS   r   r  r   r   r   r   s   @r"   r  r  Y  sh    3  $5<< $ $UX $]b]i]i $LELL D ]b]i]i  r$   r  c            	          ^  \ rS rSrS\4U 4S jjr   S
S\\R                     S\\R                     S\\R                     S\R                  4S jjrS	rU =r$ )GroupViTTextEmbeddingsi  rq   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nposition_ids)r   rb   F)
persistent)rv   rw   r{   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr   r    expandr   rq   r   r   s      r"   rw   GroupViTTextEmbeddings.__init__  s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r$   	input_idsr"  inputs_embedsr   c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nrb   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rG   r(  weightr   r"  r&  )r   r-  r"  r.  
seq_lengthmax_position_embeddingr
  r  s           r"   r   GroupViTTextEmbeddings.forward  s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r$   )r(  r&  NNN)r   r   r   r   r   rw   r   r   
LongTensorr   r  r   r   r   r   s   @r"   r   r     sp    

1 

 153759	E,,- u//0   1 12	
 
 r$   r   c            
       &  ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr\S	 5       r	S
 r
SS\R                  S\\R                     S\R                  4S jjr  SS\R                  S\\R                     S\\   S\\R"                     4S jjrSrU =r$ )GroupViTStagei  zMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rq   depthnum_prev_group_tokenr   r   c           	      j  > [         TU ]  5         X l        X@l        US:  a;  [        R
                  " [        R                  " SXAR                  5      5      U l	        OS U l	        [        R                  " [        U5       Vs/ s H  n[        U5      PM     sn5      U l        US:  a  [        UUUS9U l        OS U l        US:  ab  US:  a\  [        R                   " [        R"                  " UR                  UR$                  S9['        XUR                  S-  U5      5      U l        g S U l        g s  snf )Nr   r   )rq   r   r   rt   rL   )rv   rw   r8  r   r   r  r   r	  r{   group_token
ModuleListrangeGroupViTEncoderLayerlayersr   
downsample
Sequentialrz   r|   r   group_projector)r   rq   r8  r9  r   r   r  r   s          r"   rw   GroupViTStage.__init__  s     	
.Q!||EKK?L^L^,_`D#Dmm5QV<$X<a%9&%A<$XYQ1 /!1DO #DO!#!(;#%==V//V5J5JK v?Q?QUV?VXgh$D 
 $(D # %Ys   D0c                     U R                   S L$ N)r;  r   s    r"   with_group_tokenGroupViTStage.with_group_token  s    t++r$   c                     U R                   (       a,  US S 2S U R                  * 24   US S 2U R                  * S 24   4$ US 4$ rE  )rF  r   )r   r   s     r"   split_xGroupViTStage.split_x  sN      Q/4/////0!A8L8L7L7N4N2OOOd7Nr$   r   r;  r   c                 8    Uc  U$ [         R                  " X/SS9$ )Nr   r   )r   cat)r   r   r;  s      r"   concat_xGroupViTStage.concat_x  s!    Hyy!)q11r$   hidden_statesprev_group_tokenoutput_attentionsc                    U R                   (       aM  U R                  R                  UR                  S5      SS5      nU R                  b  X@R	                  U5      -   nOSnUnU R                  XT5      nU R                   H  nU" USSS9nUS   nM     U R                  U5      u  pTSn	U R                  b  U R                  XT5      u  pYXT4n
U(       a  X4-   n
U
$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the grouping tensors of Grouping block.
r   rb   N)attention_maskcausal_attention_mask)	rF  r;  r*  rP   rB  rM  r?  rI  r@  )r   rO  rP  rQ  r;  r   cat_xlayer	layer_outr   outputss              r"   r   GroupViTStage.forward   s       **11-2D2DQ2GRPK##/),@,@AQ,RRKa-[[EeDPTUIaLE ! e,	??&??1:LA",Gr$   )r8  r@  rB  r;  r?  r   rE  NF)r   r   r   r   r   r   rS   rw   propertyrF  rI  r   r  r   rM  r  r   r   r   r   r   r   s   @r"   r7  r7    s    W ($ (  ( "	 (
  (  (D , ,2%,, 2Xell5K 2W\WcWc 2 48,1	'||' #5<<0' $D>	'
 
u  	!' 'r$   r7  c            
          ^  \ rS rSr   SS\S\\   S\\   S\\   4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )r~   i*  rq   r{   intermediate_sizeoutput_sizec                   > [         TU ]  5         Xl        [        UR                     U l        Ub  UOUR                  nUb  UOUR                  nUb  UOUn[        R                  " X#5      U l
        [        R                  " X45      U l        g rE  )rv   rw   rq   r
   
hidden_actactivation_fnr{   r]  r   r   fc1fc2)r   rq   r{   r]  r^  r   s        r"   rw   GroupViTMLP.__init__+  s|     	#F$5$56%0%<k&BTBT1B1N-TZTlTl%0%<k+99[<99.<r$   rO  r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rE  )rb  ra  rc  )r   rO  s     r"   r   GroupViTMLP.forward;  s4    /**=9/r$   )ra  rq   rb  rc  r4  )r   r   r   r   r   r   rS   rw   r   r  r   r   r   r   s   @r"   r~   r~   *  sj     &*+/%)=$= c]= $C=	=
 c]= = U\\ ell  r$   r~   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )r   iB  c                 f   > [         TU ]  UR                  SS5      5      nUR                  SS5      $ Nr   rL   )rv   r   r   )r   r   r   s     r"   r   GroupViTMixerMLP.forwardC  s-    GOAKK1-.{{1a  r$   r   )r   r   r   r   r   r   r   r   s   @r"   r   r   B  s    ! !r$   r   c                   F  ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	    SS\R                  S	\
\R                     S
\
\R                     S\
\R                     S\
\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )rx   iH  z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )rv   rw   rq   r{   r   num_attention_heads	num_headshead_dimr   r[   attention_dropoutr  r   r   r   r   r   out_projr   s     r"   rw   GroupViTAttention.__init__K  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar$   rE   r  bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ ri  )r  rn  ro  r   re   )r   rE   r  rs  s       r"   _shapeGroupViTAttention._shape^  s5    {{3GQQRSUVWbbddr$   rO  rS  rT  r   rQ  r   c                    UR                  5       u  pgnUSLn	U R                  U5      U R                  -  n
U	(       aE  U R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nODU R                  U R	                  U5      SU5      nU R                  U R                  U5      SU5      nX`R                  -  SU R                  4nU R                  XU5      R                  " U6 n
UR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " XR                  SS5      5      nUR                  5       X`R                  -  X~4:w  a-  [        SX`R                  -  X~4 SUR                  5        35      eUbv  UR                  5       USX~4:w  a"  [        SUSX~4 SUR                  5        35      eUR                  X`R                  X~5      U-   nUR                  X`R                  -  X~5      nUbv  UR                  5       USX~4:w  a"  [        SUSX~4 SUR                  5        35      eUR                  X`R                  X~5      U-   nUR                  X`R                  -  X~5      n[        R                  R                  USS9nU(       a;  UR                  X`R                  X~5      nUR                  X`R                  -  X~5      nOSn[        R                  R!                  XR                   U R"                  S	9n[        R                  " UU5      nUR                  5       X`R                  -  XpR                  4:w  a5  [        S
X`R                  XpR                  4 SUR                  5        35      eUR                  X`R                  XpR                  5      nUR                  SS5      nUR%                  XgU5      nU R'                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNrb   r   rL   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rP   r   r[   ru  r   r   rn  ro  r  r   bmmr   r   r   r   r2   r  r   rV   rq  )r   rO  rS  rT  r   rQ  rs  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                      r"   r   GroupViTAttention.forwarda  s    #0"4"4"6i2$> {{=1DJJ>T[[1F%GSQJ;;t{{3H'I2sSLT[[%?SIJ;;t{{='A2sKLNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(*  !,$))+Q/II 7a8R7S T-22457  (,,S..'SVkkL',,S>>-A7TL%""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1!))#	BmmK0111r$   )
rq   r  r   ro  r   rn  rq  r   r[   r   )NNNF)r   r   r   r   r   rw   r   r  rS   ru  r   r   r  r   r   r   r   r   s   @r"   rx   rx   H  s    GB&eU\\ eC ec e 268<=A,1R2||R2 !.R2  (5	R2
  ((9(9:R2 $D>R2 
u||Xell3XeELL>Q5RR	SR2 R2r$   rx   c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )r>  i  rq   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g rs   )rv   rw   r{   r   rx   	self_attnr   rz   r|   layer_norm1r~   r   layer_norm2r   s     r"   rw   GroupViTEncoderLayer.__init__  sm    ++*62<<F<Q<QRv&<<F<Q<QRr$   rO  rS  rT  rQ  r   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rO  rS  rT  rQ  )r  r  r  r   )r   rO  rS  rT  rQ  residualr  rX  s           r"   r   GroupViTEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr$   )r   r  r  r   r  r  )r   r   r   r   r   rw   r   r  r   r  r   r   r   r   r   r   s   @r"   r>  r>    sk    S~ S -2&||& &  %||	&
 $D>& 
u  	!& &r$   r>  c                   &    \ rS rSr\rSrSrS rSr	g)GroupViTPreTrainedModeli  groupvitTc                 b   U R                   R                  n[        U[        R                  [        R
                  45      (       aV  UR                  R                  R                  SUS9  UR                  b$  UR                  R                  R                  5         Oh[        U[        R                  5      (       aI  UR                  R                  R                  5         UR                  R                  R                  S5        U R                   R                  n[        U[        5      (       ac  UR                  R                  R                  R                  SUS-  S9  UR                   R                  R                  R                  SUS-  S9  g[        U["        5      (       Ga   U R                   R                  nUR$                  S-  SUR                   R&                  -  S-  -  U-  nUR$                  S-  U-  n[        R(                  R                  UR*                  R                  US9  [        R(                  R                  UR,                  R                  US9  [        R(                  R                  UR.                  R                  US9  [        R(                  R                  UR0                  R                  US9  g[        U[2        5      (       a  U R                   R                  nUR                   R4                  S-  SUR                   R&                  -  S-  -  U-  nSUR                   R4                  -  S-  U-  n[        R(                  R                  UR6                  R                  US9  [        R(                  R                  UR8                  R                  US9  gg)	zInitialize the weightsr@   )meanstdNr1   g{Gz?r   rL   )r  )rq   initializer_ranger   r   r   r   r0  datanormal_biaszero_rz   fill_initializer_factorr   r&  r(  rx   r   num_hidden_layersinitr   r   r   rq  r~   r{   rb  rc  )r   module
init_rangefactorin_proj_stdout_proj_stdfc_stds          r"   _init_weights%GroupViTPreTrainedModel._init_weights  s    [[22
fryy"))455 MM&&CZ&@{{&  &&(--KK""$MM$$S)//f455""))..66CVd]6S%%,,1199sQU9V 122[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE,,[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? -r$   r   N)
r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r   r   r$   r"   r  r    s    !L"&*#@r$   r  c                      ^  \ rS rSrS\SS4U 4S jjr   SS\R                  S\\	   S\\	   S	\\	   S\
\\4   4
S
 jjrSrU =r$ )GroupViTVisionEncoderi  rq   r   Nc                 j  > [         TU ]  5         Xl        [        R                  " [        [        UR                  5      5       Vs/ s HO  n[        UUR                  U   UR                  U   UR                  U   US:  a  UR                  US-
     OSS9PMQ     sn5      U l        SU l        g s  snf )Nr   r   )rq   r8  r   r   r9  F)rv   rw   rq   r   r<  r=  r!   depthsr7  num_group_tokensnum_output_groupsstagesgradient_checkpointing)r   rq   ir   s      r"   rw   GroupViTVisionEncoder.__init__  s    mm s6==12	 3A ! --*$*$;$;A$>%+%=%=a%@LMPQE)A)A!a%)HWX 3	
 ',#	s   AB0rO  output_hidden_statesrQ  return_dictc                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOS nU(       a  SOS nS n[	        U R
                  5       H=  u  pU(       a  XQ4-   nU	" XU5      n
U
S   nU
S   nU(       d  M-  U
S   c  M5  XjS   4-   nM?     U(       a  XQ4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )Nr   r   r   rL   c              3   .   #    U  H  oc  M  Uv   M     g 7frE  r   r   vs     r"   r   0GroupViTVisionEncoder.forward.<locals>.<genexpr>E  s     g$Uq$U   	last_hidden_staterO  rX   )rq   rQ  r  use_return_dict	enumerater  r   r   )r   rO  r  rQ  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r"   r   GroupViTVisionEncoder.forward#  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"6BD/T!$++.HA#$58H$H!!-?PQM)!,M(+L  ]1%5%A -q1A0C C /   14D Dg]}$Uggg+Yf
 	
r$   )rq   r  r  r4  )r   r   r   r   r   rw   r   r  r   r  r   r   r   r   r   r   r   s   @r"   r  r    sv    ,3 , ,( 04,0&*%
||%
 'tn%
 $D>	%

 d^%
 
uo%	&%
 %
r$   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )GroupViTTextEncoderiK  z
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
[`GroupViTEncoderLayer`].

Args:
    config: GroupViTTextConfig
rq   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf rZ  )
rv   rw   rq   r   r<  r=  r  r>  r?  r  )r   rq   r  r   s      r"   rw   GroupViTTextEncoder.__init__T  sT    mm5QWQiQiKj$kKja%9&%AKj$kl&+# %ls   A&rS  rT  rQ  r  r  r   c                 L   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       Hr  u  pU(       a  Xy4-   nU R                  (       a1  U R                  (       a   U R                  UR                  U	UUU5      nO	U" U	UUUS9nUS   n	U(       d  Mj  XS   4-   nMt     U(       a  Xy4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr   )rQ  r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frE  r   r  s     r"   r   .GroupViTTextEncoder.forward.<locals>.<genexpr>  s     e$Sq$Sr  r  )rq   rQ  r  r  r  r?  r  r   _gradient_checkpointing_func__call__r   r   )r   r.  rS  rT  rQ  r  r  encoder_statesall_attentionsrO  idxencoder_layerr  s                r"   r   GroupViTTextEncoder.forwardZ  s8   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- #90  +.>>Ne]N$Seee+Vd
 	
r$   )rq   r  r?  )NNNNN)r   r   r   r   r   r   rw   r   r   r  r  r   r   r   r   r   r   r   s   @r"   r  r  K  s    ,1 , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
 O
r$   r  c                      ^  \ rS rSrS\4U 4S jjr\      SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jj5       rSrU =r$ )GroupViTTextTransformeri  rq   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        UR                  U l        g rs   )rv   rw   rq   r{   r   r  r  encoderr   rz   r|   final_layer_normeos_token_idr+  s      r"   rw    GroupViTTextTransformer.__init__  s]    &&	08*62 "Y<Q<Q R #//r$   r-  rS  r"  rQ  r  r  r   c           	      0   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eUR                  5       nUR                  SUS   5      nU R                  XS9n[        XxR                  UR                  S9n	Ub  [        X(R                  5      nU R                  UUU	UUUS9n
U
S   nU R                  U5      nU R                  S:X  ae  U[        R                   " UR"                  S   UR                  S9UR%                  [        R&                  UR                  S9R)                  SS	94   nOU[        R                   " UR"                  S   UR                  S9UR%                  [        R&                  UR                  S9U R                  :H  R'                  5       R)                  SS	94   nU(       d	  X4U
S
S  -   $ [+        UUU
R,                  U
R.                  S9$ )NzYou have to specify input_idsrb   )r-  r"  r   )r.  rS  rT  rQ  r  r  r   rL   )rA   r   r   r   r  pooler_outputrO  rX   )rq   rQ  r  r  r   rP   r  r  r   rA   r   r   r  r  r  r   r    rG   torS   argmaxr   rO  rX   )r   r-  rS  r"  rQ  r  r  input_shaperO  rT  encoder_outputsr  pooled_outputs                r"   r   GroupViTTextTransformer.forward  s&    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&NN2{27	)W !A,,]5I5I!

 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %58KKK)/')77&11	
 	
r$   )rq   r  r  r  r  NNNNNN)r   r   r   r   r   rw   r   r   r   r  r  r   r   r   r   r   r   r   s   @r"   r  r    s    	01 	0  -115/3,0/3&*L
ELL)L
 !.L
 u||,	L

 $D>L
 'tnL
 d^L
 
u00	1L
 L
r$   r  c                     ^  \ rS rSr\rS\4U 4S jjrS\R                  4S jr	S r
\      SS\\R                     S\\R                     S	\\R                     S
\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )GroupViTTextModeli  rq   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rE  )rv   rw   r  
text_model	post_initr   s     r"   rw   GroupViTTextModel.__init__  s&     1&9r$   r   c                 B    U R                   R                  R                  $ rE  r  r  r&  r   s    r"   get_input_embeddings&GroupViTTextModel.get_input_embeddings  s    ))999r$   c                 8    XR                   R                  l        g rE  r  )r   r   s     r"   set_input_embeddings&GroupViTTextModel.set_input_embeddings  s    5:""2r$   r-  rS  r"  rQ  r  r  c           	      *    U R                  UUUUUUS9$ )a  
Examples:

```python
>>> from transformers import CLIPTokenizer, GroupViTTextModel

>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```r-  rS  r"  rQ  r  r  r  )r   r-  rS  r"  rQ  r  r  s          r"   r   GroupViTTextModel.forward  s,    2 )%/!5#  
 	
r$   r  r  )r   r   r   r   r   r  rw   r   Moduler  r  r   r   r   r  r  r   r   r   r   r   r   r   s   @r"   r  r    s    %L1 :bii :;  -115/3,0/3&*
ELL)
 !.
 u||,	

 $D>
 'tn
 d^
 
u00	1
 
r$   r  c                      ^  \ rS rSrS\4U 4S jjr\    SS\\R                     S\\
   S\\
   S\\
   S\\\4   4
S	 jj5       rS
rU =r$ )GroupViTVisionTransformeri:  rq   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        g rs   )rv   rw   rq   r{   r  r  r  r  r   rz   r|   r  r+  s      r"   rw   "GroupViTVisionTransformer.__init__;  sL    &&	26:,V4i5J5JKr$   r   r  rQ  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U5      nU R                  UUUUS9nUS   nU R                  U5      nUR                  SS9nU(       d	  Xx4USS  -   $ [        UUUR                  UR                  S9$ )Nz You have to specify pixel_values)rO  r  rQ  r  r   r   r   r  )rq   rQ  r  r  r   r  r  r  r  r   rO  rX   )	r   r   r  rQ  r  rO  r  r  r  s	            r"   r   !GroupViTVisionTransformer.forwardD  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@5,,'!5/#	 ' 
 ,A. !NN+<=)..1.5%58KKK)/')77&11	
 	
r$   )rq   r  r  r  NNNN)r   r   r   r   r   rw   r   r   r   r   r  r   r   r   r   r   r   r   s   @r"   r  r  :  s    L3 L  59/3,0&*'
u001'
 'tn'
 $D>	'

 d^'
 
u00	1'
 '
r$   r  c                      ^  \ rS rSr\rSrS\4U 4S jjrS\4S jr	\
    SS\\R                     S\\   S\\   S	\\   S\\\4   4
S
 jj5       rSrU =r$ )GroupViTVisionModelio  r   rq   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g rE  )rv   rw   r  vision_modelr  r   s     r"   rw   GroupViTVisionModel.__init__s  s'     5f=r$   r   c                 B    U R                   R                  R                  $ rE  )r  r  r  r   s    r"   r  (GroupViTVisionModel.get_input_embeddingsy  s      ++<<<r$   rQ  r  r  c                 &    U R                  UUUUS9$ )ah  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTVisionModel

>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r   rQ  r  r  r  )r   r   rQ  r  r  s        r"   r   GroupViTVisionModel.forward|  s(    8   %/!5#	 ! 
 	
r$   r  r  )r   r   r   r   r   r  main_input_namerw   r   r  r   r   r   r   r  r   r   r   r   r   r   r   s   @r"   r  r  o  s    'L$O3 =&= =  59,0/3&* 
u001 
 $D> 
 'tn	 

 d^ 
 
u00	1 
  
r$   r  c                   >  ^  \ rS rSr\rS\4U 4S jjr\      SS\\	R                     S\\	R                     S\\	R                     S\\   S\\   S	\\   S
\	R                  4S jj5       r\    SS\\	R                     S\\   S\\   S	\\   S
\	R                  4
S jj5       r\         SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\\   S\\   S	\\   S
\\\4   4S jj5       rSrU =r$ )GroupViTModeli  rq   c                 >  > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l
        UR                  U l        UR                  U l        [        U5      U l        [!        U5      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R&                  " [$        R(                  " U R                  U R                  SS9[$        R*                  " U R                  5      [$        R,                  " SS9[$        R(                  " U R                  U R                  SS95      U l        [$        R2                  " [4        R6                  " U R8                  R:                  5      5      U l        U R?                  5         g )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)r  )inplace) rv   rw   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimr{   text_embed_dimvision_embed_dimr  r  r  r  r   rA  r   BatchNorm1dReLUvisual_projectiontext_projectionr  r   rE   rq   logit_scale_init_valuelogit_scaler  )r   rq   r  r  r   s       r"   rw   GroupViTModel.__init__  s    &,,.@AA++,-Q0 
 &..0DEE--./q2 
 ((,,$33+1+M+M()55 - 9 91+>5mD!#IId++T-M-MTXYNN4;;<GGD!IId668K8KRVW	"
  "}}IId))4+K+KRVWNN4;;<GGD!IId668K8KRVW	 
 <<T[[5W5W(XY 	r$   r-  rS  r"  rQ  r  r  r   c           	          Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUUS9nUS   nU R                  U5      n	U	$ )aF  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`GroupViTTextModel`].

Examples:

```python
>>> from transformers import CLIPTokenizer, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```r  r   )rq   rQ  r  r  r  r  )
r   r-  rS  r"  rQ  r  r  text_outputsr  text_featuress
             r"   get_text_featuresGroupViTModel.get_text_features  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%/!5# ' 
 %Q,,];r$   r   c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUS9nUS   nU R                  U5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`GroupViTVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```r  r   )rq   rQ  r  r  r  r  )r   r   rQ  r  r  vision_outputsr  image_featuress           r"   get_image_features GroupViTModel.get_image_features  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5#	 + 
 'q)//>r$   return_lossoutput_segmentationc
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SnUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  UUUU	S9n
U R                  UUUUUU	S9nU
S   nU R                  U5      nUS   nU R                  U5      nXR                  SSS9-  nXR                  SSS9-  nU R                  R                  5       n[        R                  " XR                  5       5      U-  nUR                  5       nSnU(       Gaf  U
S   nU R                  UR                  SUR                   S   5      5      nU(       a  U
S	   nOU
S
   n[#        UUR                   S
S 5      nUUR                  SSS9-  n[        R                  " UUR                  5       5      U-  nUR                  UR                   S   SUR                   S   5      R%                  SS
S5      nUR                  UR                   S   UR                   S   S5      n[        R                  " UU5      U-  nUR                  UR                   S   UR                   S   UR                   S
   UR                   S	   5      nSnU(       a  ['        U5      nU	(       d  Ub
  UUUUUUU
4nOUXXU
4nUb  U4U-   $ U$ [)        UUUUUUUU
S9$ )a  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.
output_segmentation (`bool`, *optional*):
    Whether or not to return the segmentation logits.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, GroupViTModel

>>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
>>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NTr  r  r   rb   r   r   r	   rL   )r   r   r   r   r   r   r   r   )rq   rQ  r'  r  r  r  r  r  r  normr  expr   matmulr'   rV   rG   rm   rd   r*   r   )r   r-  r   rS  r"  r&  rQ  r  r'  r  r"  r  r   r   r  r   r   
seg_logitsimage_group_embedsrX   groupinglogits_per_image_groupflatten_groupingr   outputs                            r"   r   GroupViTModel.forward/  sc   R 2C1N-TXT_T_TqTq#6#BHgHg 	  $$8$D $++JjJj 	 &1%<k$++B]B]**%/!5#	 + 
 )%/!5# ' 
 &a(--l;"1o**;7 $&7&7B&7&MM!$4$4T$4$JJ &&**,,,{NN4DES*,,.
 "0!2!%!7!78J8R8RSUWiWoWoprWs8t!u#+A.
+A.
3J@R@RSTSU@VWH "46H6M6MRT^b6M6c!c%*\\2Dkmmo%VYd%d"%;%C%C""1%r;+<+<Q+?&gaA #
  (//q0A8>>RSCTVXY &<>NOR]]J#++  #Z%5%5a%8(..:KX^^\]M^J  1D%$#  " +O,ftu)-)9TGf$EvE"-+ *#%* .	
 		
r$   )	r  r  r  r  r  r  r  r  r  r  r  )	NNNNNNNNN)r   r   r   r   r   r  rw   r   r   r   r  r  r   r  r$  r5  r   r   r   r   r   r   r   s   @r"   r	  r	    s   !L)~ )V  -115/3,0/3&*,ELL), !., u||,	,
 $D>, 'tn, d^, 
		, ,\  59,0/3&*.u001. $D>. 'tn	.
 d^. 
		. .`  15481537&*,0/3.2&*N
E,,-N
 u001N
 !.	N

 u//0N
 d^N
 $D>N
 'tnN
 &d^N
 d^N
 
u))	*N
 N
r$   r	  )r	  r  r  r  )r   Frb   r  )Er   collections.abcr   dataclassesr   typingr   r   r   r   numpyrT   r   torch.utils.checkpointr   activationsr
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   configuration_groupvitr   r   r   
get_loggerr   loggerr  r#   r*   rS   r<   floatr  rJ   r`   rm   r  ro   r   r   r   r   r  r   r7  r~   r   rx   r>  r  r  r  r  r  r  r  r	  __all__r   r$   r"   <module>rB     s[     ! . .     ! d K - D D \ \ 
		H	%
`U\\ `ell `
-ell -u|| - C 5<< e t RU _d_k_k ,<:"))  -bii -`4+")) 4+n /
+ /
 /
dbii DGryy GV%RYY %P[BII [|")) 0!{ !k2		 k2^/299 /d $@o $@ $@N7
BII 7
t^
")) ^
BY
bii Y
x/
/ /
d2
		 2
j.
1 .
b ]
+ ]
 ]
@	 cr$   