o
    Zh                     @   s  d Z ddlZddlmZ ddlmZmZmZm	Z	 ddl
ZddlZddlZddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZ ddlmZm Z m!Z! e"e#Z$dej%dej%fddZ&dej%dej%fddZ'dej%de(fddZ)dKdej%de*de+de(dej%f
ddZ,dLddZ-d d! Z.G d"d# d#ej/Z0G d$d% d%ej/Z1G d&d' d'ej/Z2eG d(d) d)eZ3G d*d+ d+ej/Z4G d,d- d-ej/Z5G d.d/ d/ej/Z6G d0d1 d1ej/Z7G d2d3 d3ej/Z8G d4d5 d5e8Z9G d6d7 d7ej/Z:G d8d9 d9ej/Z;eG d:d; d;eZ<G d<d= d=ej/Z=G d>d? d?ej/Z>G d@dA dAej/Z?G dBdC dCe<Z@G dDdE dEej/ZAG dFdG dGe<ZBeG dHdI dIe<ZCg dJZDdS )MzPyTorch GroupViT model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalZcross_entropytorcharangelenr   )r    r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_loss'   s   r!   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r!   t)r"   Zcaption_lossZ
image_lossr   r   r    groupvit_loss,   s   r$   dimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NTkeepdimr   Zmemory_format      ?)softmaxmaxr   
zeros_likelegacy_contiguous_formatscatter_detach)r   r%   y_softindexy_hardretr   r   r    hard_softmax2   s
   
r4   Ftauhardc           
      C   s   t jjt jd| j| jdt jd| j| jd}|| j}| | | }|	|}|rK|j
|ddd }t j| t jd||d}||  | }	|	S |}	|	S )N        )r   dtyper)   Tr&   r   r(   )r   distributionsgumbelZGumbeltensorr   r9   sampleshaper*   r+   r,   r-   r.   r/   )
r   r6   r7   r%   Zgumbel_distZgumbelsr0   r1   r2   r3   r   r   r    gumbel_softmax<   s   
r?   c           	      C   s   || | j d  d }||kr tt|| }| j d | }ntt|| }| j d | }| j d }| j d }| ||||} tjj| ||fd|d} | S )a  
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
             ?r   r   Zbilinearsizemodealign_corners)r>   intnproundreshaper   r   interpolate)	
attentionsheightwidthrE   scaleZ
feat_widthZfeat_height
batch_sizegroupsr   r   r    resize_attention_mapR   s   

rQ   c                 C   s   g }t  7 d}| D ]*}|ddd }|du r|}n|| }t|ddd g|R  }|| qW d   n1 s@w   Y  |d }|S )a1  
    Args:
        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   r@   r   r5   )r   Zno_gradpermute
contiguousrQ   append)rK   Zhw_shapeZ	attn_mapsZprev_attn_masksZ
attn_masksZcur_attn_mapZfinal_groupingr   r   r    get_grouping_from_attentionsp   s   	
rU   c                       s*   e Zd Zdef fddZdd Z  ZS )GroupViTCrossAttentionLayerconfigc                    sJ   t    t|| _tj|j|jd| _t	|| _
tj|j|jd| _d S NZeps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfrW   	__class__r   r    r[      s
   


z$GroupViTCrossAttentionLayer.__init__c                 C   s<   |}|| j ||dd  }|| | | }| |}|S )N)encoder_hidden_statesr   )r]   rc   ra   rd   )rf   querykeyxr   r   r    forward   s
   
z#GroupViTCrossAttentionLayer.forward)__name__
__module____qualname__r   r[   rm   __classcell__r   r   rg   r    rV      s    rV   c                       s4   e Zd Zdef fddZd	ddZdd Z  ZS )
GroupViTAssignAttentionrW   c                    sj   t    |jd | _t|j|j| _t|j|j| _t|j|j| _t|j|j| _	|j
| _
d S )N      )rZ   r[   r_   rN   r   Linearq_projk_projv_projproj
assign_epsre   rg   r   r    r[      s   
z GroupViTAssignAttention.__init__Tc                 C   sD   |r| j rt|d|d}|S |rt|dd}|S tjj|dd}|S )N)r%   r7   r%   )trainingr?   r4   r   r   r*   )rf   r]   r;   r7   r   r   r    get_attn   s   
z GroupViTAssignAttention.get_attnc                 C   s   |}|  |}| |}| |}||dd | j }| |}| j|ddd}||jddd| j  }|| }| |}||fS )Nrz   r5   F)r;   r7   Tr%   r'   )	ru   rv   rw   	transposerN   r}   sumry   rx   )rf   rj   rk   valueZraw_attnr]   Z	soft_attnoutr   r   r    rm      s   




zGroupViTAssignAttention.forward)TT)rn   ro   rp   r   r[   r}   rm   rq   r   r   rg   r    rr      s    

rr   c                       s2   e Zd Zdef fddZdd Zdd Z  ZS )GroupViTTokenAssignrW   c                    s   t    || _tj j jd| _t j	t
jjr j	n j	 j	f} fdd|D \}}t |||| _tj j jd| _tj j jd| _t | _t | _tj j jd| _t  j| j| _d S )NrY   c                    s   g | ]	}t | j qS r   )rF   r_   ).0rl   rW   r   r    
<listcomp>   s    z0GroupViTTokenAssign.__init__.<locals>.<listcomp>)rZ   r[   num_output_groupr   r^   r_   r`   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterableGroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrV   pre_assign_attnrr   assign
norm_new_xrb   mlp_channels)rf   rW   num_group_tokenr   r   Z
tokens_dimZchannels_dimrg   r   r    r[      s   



zGroupViTTokenAssign.__init__c                 C   s   |  |}| |}|S )z
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        )r   r   )rf   group_tokensprojected_group_tokensr   r   r    project_group_token   s   
	
z'GroupViTTokenAssign.project_group_tokenc                 C   s^   |  |}| |}| |}| ||}| ||\}}||7 }|| | | }||fS )z
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        )r   r   r   r   r   r   r   )rf   Zimage_tokensr   r   Znew_image_tokens	attentionr   r   r    rm      s   


zGroupViTTokenAssign.forward)rn   ro   rp   r   r[   r   rm   rq   r   r   rg   r    r      s    r   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeej ed< dZeed	< dZeed
< dee fddZdS )GroupViTModelOutputa\  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
            Classification scores for each pixel.

            <Tip warning={true}>

            The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
            to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
            original image size as post-processing. You should always check your logits shape and resize as needed.

            </Tip>

        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`GroupViTVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`GroupViTVisionModel`].
    Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r   r   N)getattrto_tuple)r   krf   r   r    	<genexpr>1  s
    
z/GroupViTModelOutput.to_tuple.<locals>.<genexpr>)tuplekeysr   r   r   r    r   0  s   zGroupViTModelOutput.to_tuple)rn   ro   rp   __doc__r   r   r   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r     s   
 !r   c                	       sh   e Zd ZdZ				ddedeeeeef f ded	ef fd
dZddej	de
dej	fddZ  ZS )GroupViTPatchEmbeddingsz#
    Image to Patch Embedding.
          r      
image_size
patch_sizenum_channels	embed_dimc                    s   t    t|tjjr|n||f}t|tjjr|n||f}|d |d  |d |d   }|| _|| _|| _t	j
||||d| _d S )Nr   r   )Zkernel_sizeZstride)rZ   r[   r   r   r   r   r   r   num_patchesr   Conv2d
projection)rf   r   r   r   r   r   rg   r   r    r[   <  s   
 z GroupViTPatchEmbeddings.__init__Fpixel_valuesinterpolate_pos_encodingr   c              
   C   sx   |j \}}}}|s.|| jd ks|| jd kr.td| d| d| jd  d| jd  d	| |ddd}|S )Nr   r   zInput image size (*z) doesn't match model ().r@   )r>   r   
ValueErrorr   flattenr   )rf   r   r   rO   r   rL   rM   rl   r   r   r    rm   M  s   zGroupViTPatchEmbeddings.forward)r   r   r   r   F)rn   ro   rp   r   rF   r   r   r[   r   Tensorboolrm   rq   r   r   rg   r    r   7  s     $r   c                       s\   e Zd Zdef fddZdejdededejfdd	Zddejde	dejfddZ
  ZS )GroupViTVisionEmbeddingsrW   c                    sx   t    t|j|j|j|jd| _| jj}t	
td||j| _t	|j| _t	j|j|jd| _|j| _|| _d S )N)r   r   r   r   r   rY   )rZ   r[   r   r   r   r   r_   patch_embeddingsr   r   	Parameterr   zerosposition_embeddingsZDropoutdropoutr^   r`   	layernormrW   )rf   rW   r   rg   r   r    r[   Z  s   

z!GroupViTVisionEmbeddings.__init__
embeddingsrL   rM   r   c                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r5   rA   r   r   r@   ZbicubicFrB   )r>   r   r   Zjit
is_tracingr   r   rI   rR   r   r   rJ   view)rf   r   rL   rM   r   Znum_positionsZpatch_pos_embedr%   Z
new_heightZ	new_widthZsqrt_num_positionsr   r   r    r   j  s&   




z1GroupViTVisionEmbeddings.interpolate_pos_encodingFr   r   c           
      C   sd   |j \}}}}| j||d}| |}| \}}}	|r&|| ||| }n|| j }| |}|S )N)r   )r>   r   r   rC   r   r   r   )
rf   r   r   rO   r   rL   rM   r   seq_len_r   r   r    rm     s   


z GroupViTVisionEmbeddings.forwardr   )rn   ro   rp   r   r[   r   r   rF   r   r   rm   rq   r   r   rg   r    r   Y  s    $&r   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )GroupViTTextEmbeddingsrW   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )Nposition_ids)r   r5   F)
persistent)rZ   r[   r_   r   Z	EmbeddingZ
vocab_sizetoken_embeddingZmax_position_embeddingsposition_embeddingZregister_bufferr   r   expandrf   rW   r   rg   r   r    r[     s   

zGroupViTTextEmbeddings.__init__N	input_idsr   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )Nr5   rz   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r>   r   weightr   r   r   )rf   r   r   r   Z
seq_lengthZmax_position_embeddingr   r   r   r   r    rm     s"   

zGroupViTTextEmbeddings.forwardNNN)rn   ro   rp   r   r[   r   r   
LongTensorr   r   rm   rq   r   r   rg   r    r     s    r   c                
       s   e Zd ZdZdededededef
 fddZed	d
 Zdd Z	dde
jdee
j de
jfddZ		dde
jdee
j dee dee
j fddZ  ZS )GroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rW   depthnum_prev_group_tokenr   r   c                    s   t    || _|| _|dkrttd| j| _	nd | _	t
 fddt|D | _|dkr;t ||d| _nd | _|dkr^|dkr^ttj j jdt | jd || _d S d | _d S )Nr   r   c                       g | ]}t  qS r   GroupViTEncoderLayerr   r   r   r   r    r         z*GroupViTStage.__init__.<locals>.<listcomp>)rW   r   r   rY   r@   )rZ   r[   r   r   r   r   r   r   r_   group_token
ModuleListrangelayersr   
downsample
Sequentialr^   r`   r   group_projector)rf   rW   r   r   r   r   rg   r   r    r[     s(   



zGroupViTStage.__init__c                 C   s
   | j d uS N)r   r   r   r   r    with_group_token  s   
zGroupViTStage.with_group_tokenc                 C   s>   | j r|d d d | j f |d d | j d f fS |d fS r   )r   r   rf   rl   r   r   r    split_x  s   0zGroupViTStage.split_xNrl   r   r   c                 C   s   |d u r|S t j||gddS )Nr   r{   )r   cat)rf   rl   r   r   r   r    concat_x  s   zGroupViTStage.concat_xFhidden_statesprev_group_tokenoutput_attentionsc                 C   s   | j r| j|ddd}| jdur|| | }nd}|}| ||}| jD ]}||ddd}|d }q(| |\}}d}	| jdurL| ||\}}	||f}
|rW|
|	f }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        r   r5   N)attention_maskcausal_attention_mask)	r   r   r   rC   r   r   r   r   r   )rf   r   r   r   r   rl   Zcat_xlayerZ	layer_outr   outputsr   r   r    rm      s&   




zGroupViTStage.forwardr   )NF)rn   ro   rp   r   r   rF   r[   propertyr   r   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r     s8    "
"r   c                
       sX   e Zd Z			ddedee dee dee f fddZdejd	ejfd
dZ	  Z
S )rb   NrW   r_   intermediate_sizeoutput_sizec                    sp   t    || _t|j | _|d ur|n|j}|d ur|n|j}|d ur&|n|}t	||| _
t	||| _d S r   )rZ   r[   rW   r	   Z
hidden_actactivation_fnr_   r   r   rt   fc1fc2)rf   rW   r_   r   r   rg   r   r    r[   +  s   
zGroupViTMLP.__init__r   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )rf   r   r   r   r    rm   ;  s   


zGroupViTMLP.forwardr   )rn   ro   rp   r   r   rF   r[   r   r   rm   rq   r   r   rg   r    rb   *  s    rb   c                       s   e Zd Z fddZ  ZS )r   c                    s    t  |dd}|ddS Nr   r@   )rZ   rm   r   r   rg   r   r    rm   C  s   zGroupViTMixerMLP.forward)rn   ro   rp   rm   rq   r   r   rg   r    r   B  s    r   c                       s   e Zd ZdZ fddZdejdedefddZ							
ddejde	ej de	ej de	ej
 de	e deeje	ej e	eej  f fddZ  ZS )r\   z=Multi-headed attention from 'Attention Is All You Need' paperc                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   rs   )rZ   r[   rW   r_   r   Znum_attention_heads	num_headshead_dimr   rN   Zattention_dropoutr   r   rt   rv   rw   ru   out_projre   rg   r   r    r[   K  s"   

zGroupViTAttention.__init__r<   r   bszc                 C   s    | ||| j| jdd S r   )r   r   r   r   rS   )rf   r<   r   r   r   r   r    _shape^  s    zGroupViTAttention._shapeNFr   r   r   ri   r   r   c                 C   s  |  \}}}|du}	| || j }
|	r*| | |d|}| | |d|}n| | |d|}| | |d|}|| j d| jf}| |
||j| }
|j| }|j| }| d}t	
|
|dd}|  || j ||fkrtd|| j ||f d|   |dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}|dur|  |d||fkrtd|d||f d|   ||| j||| }||| j ||}tjj|dd}|r||| j||}||| j ||}nd}tjj|| j| jd	}t	
||}|  || j || jfkr@td
|| j|| jf d|   ||| j|| j}|dd}||||}| |}||fS )z#Input shape: Batch x Time x ChannelNr5   r   r@   z$Attention weights should be of size z	, but is z!Attention mask should be of size r{   )pr|   z `attn_output` should be of size )rC   ru   rN   r  rv   rw   r   r   r   r   Zbmmr   r   r   r   r*   r   r|   rI   r   )rf   r   r   r   ri   r   r   Ztgt_lenr   Zis_cross_attentionZquery_statesZ
key_statesZvalue_statesZ
proj_shapeZsrc_lenattn_weightsZattn_weights_reshapedZ
attn_probsZattn_outputr   r   r    rm   a  sl   




zGroupViTAttention.forward)NNNF)rn   ro   rp   r   r[   r   r   rF   r  r   r   r   r   rm   rq   r   r   rg   r    r\   H  s*    r\   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )r   rW   c                    sR   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S rX   )rZ   r[   r_   r   r\   	self_attnr   r^   r`   layer_norm1rb   rc   layer_norm2re   rg   r   r    r[     s   


zGroupViTEncoderLayer.__init__Fr   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   )r  r  r  rc   )rf   r   r   r   r   Zresidualr  r   r   r   r    rm     s"   




zGroupViTEncoderLayer.forwardr   )rn   ro   rp   r   r[   r   r   r   r   r   r   rm   rq   r   r   rg   r    r     s    r   c                   @   s    e Zd ZeZdZdZdd ZdS )GroupViTPreTrainedModelZgroupvitTc                 C   s  | j j}t|tjtjfr"|jjjd|d |j	dur!|j	j
  nt|tjr5|j	j
  |jjd | j j}t|trX|jjjjd|d d |jjjjd|d d dS t|tr| j j}|jd d|j j d  | }|jd | }tjj|jj|d tjj|jj|d tjj|jj|d tjj|jj|d dS t|tr| j j}|j jd d|j j d  | }d|j j d | }tjj|jj|d tjj|jj|d dS dS )	zInitialize the weightsr8   )meanstdNr)   g{Gz?rs   r@   )r	  )rW   Zinitializer_ranger   r   rt   r   r   dataZnormal_biasZzero_r^   Zfill_Zinitializer_factorr   r   r   r\   r   num_hidden_layersinitru   rv   rw   r   rb   r_   r   r   )rf   moduleZ
init_rangefactorZin_proj_stdZout_proj_stdZfc_stdr   r   r    _init_weights  s8   



 z%GroupViTPreTrainedModel._init_weightsN)rn   ro   rp   r   config_classZbase_model_prefixZsupports_gradient_checkpointingr  r   r   r   r    r    s
    r  c                       sb   e Zd Zdeddf fddZ			ddejdee dee d	ee de	e
ef f
d
dZ  ZS )GroupViTVisionEncoderrW   r   Nc                    s>   t     | _t fddtt jD | _d| _	d S )Nc              
      sF   g | ]}t   j|  j|  j| |d kr j|d  nd dqS )r   r   )rW   r   r   r   r   )r   depthsZnum_group_tokensZnum_output_groups)r   ir   r   r    r     s    z2GroupViTVisionEncoder.__init__.<locals>.<listcomp>F)
rZ   r[   rW   r   r   r   r   r  stagesgradient_checkpointingre   rg   r   r    r[     s   


zGroupViTVisionEncoder.__init__r   output_hidden_statesr   return_dictc                 C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|r"dnd }|r(dnd }d }t| jD ](\}}	|r<||f }|	|||}
|
d }|
d }|rY|
d d urY||
d f }q1|ra||f }|sotdd |||fD S t|||dS )Nr   r   r   r@   c                 s       | ]	}|d ur|V  qd S r   r   r   vr   r   r    r   E      z0GroupViTVisionEncoder.forward.<locals>.<genexpr>last_hidden_stater   rK   )rW   r   r  use_return_dict	enumerater  r   r   )rf   r   r  r   r  Zall_hidden_statesZall_groupingsr   r  Zstagelayer_outputsr   r   r    rm   #  s.   

zGroupViTVisionEncoder.forwardr   )rn   ro   rp   r   r[   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r    s     
r  c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    rW   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    r   r   r   r   r   r   r    r   W  r   z0GroupViTTextEncoder.__init__.<locals>.<listcomp>F)	rZ   r[   rW   r   r   r   r  r   r  re   rg   r   r    r[   T  s   
 
zGroupViTTextEncoder.__init__Nr   r   r   r  r  r   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]1\}
}|r<||	f }| jrM| jrM| |j	|	|||}n||	|||d}|d }	|rb||d f }q1|rj||	f }|sxt
dd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr   )r   r   r   c                 s   r  r   r   r  r   r   r    r     r  z.GroupViTTextEncoder.forward.<locals>.<genexpr>r  )rW   r   r  r  r   r   r  r|   Z_gradient_checkpointing_func__call__r   r   )rf   r   r   r   r   r  r  Zencoder_statesZall_attentionsr   idxZencoder_layerr!  r   r   r    rm   Z  sF   &

zGroupViTTextEncoder.forward)NNNNN)rn   ro   rp   r   r   r[   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r"  K  s*    	
r"  c                       s   e Zd Zdef fddZe						ddeej deej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )GroupViTTextTransformerrW   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S rX   )rZ   r[   rW   r_   r   r   r"  encoderr   r^   r`   final_layer_normeos_token_idr   rg   r   r    r[     s   


z GroupViTTextTransformer.__init__Nr   r   r   r   r  r  r   c                 C   sj  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}t||j	|j
d}	|d urLt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
djdd	f }n|tj|jd |j
d|jtj|j
d| jk jdd	f }|s||f|
d
d   S t|||
j|
jdS )NzYou have to specify input_idsr5   )r   r   r   )r   r   r   r   r  r  r   r@   )r9   r   r{   r   r  Zpooler_outputr   rK   )rW   r   r  r  r   rC   r   r   r
   r9   r   r   r&  r'  r(  r   r   r>   torF   Zargmaxr   r   rK   )rf   r   r   r   r   r  r  Zinput_shaper   r   encoder_outputsr  pooled_outputr   r   r    rm     s\   

	

	zGroupViTTextTransformer.forwardNNNNNN)rn   ro   rp   r   r[   r   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r%    s0    
r%  c                       s   e Zd ZeZdef fddZdejfddZdd Z	e
												dd
eej deej deej dee dee dee deeef fddZ  ZS )GroupViTTextModelrW   c                    "   t  | t|| _|   d S r   )rZ   r[   r%  
text_model	post_initre   rg   r   r    r[        
zGroupViTTextModel.__init__r   c                 C   
   | j jjS r   r0  r   r   r   r   r   r    get_input_embeddings     
z&GroupViTTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r4  )rf   r   r   r   r    set_input_embeddings  s   z&GroupViTTextModel.set_input_embeddingsNr   r   r   r   r  r  c                 C   s   | j ||||||dS )a9  
        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   r   r  r  )r0  )rf   r   r   r   r   r  r  r   r   r    rm     s   zGroupViTTextModel.forwardr-  )rn   ro   rp   r   r  r[   r   Moduler5  r7  r   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r.    s6    
r.  c                       sh   e Zd Zdef fddZe				ddeej dee	 dee	 dee	 d	e
eef f
d
dZ  ZS )GroupViTVisionTransformerrW   c                    s@   t    || _|j}t|| _t|| _tj	||j
d| _d S rX   )rZ   r[   rW   r_   r   r   r  r&  r   r^   r`   r   r   rg   r   r    r[   ;  s   


z"GroupViTVisionTransformer.__init__Nr   r  r   r  r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j||||d}|d }| |}|jdd}|sO||f|dd   S t	|||j
|jdS )Nz You have to specify pixel_values)r   r  r   r  r   r   r{   r)  )rW   r   r  r  r   r   r&  r   r  r   r   rK   )	rf   r   r  r   r  r   r+  r  r,  r   r   r    rm   D  s0   

z!GroupViTVisionTransformer.forwardNNNN)rn   ro   rp   r   r[   r   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r:  :  s$    	
r:  c                       s~   e Zd ZeZdZdef fddZdefddZe					dde
ej d	e
e d
e
e de
e deeef f
ddZ  ZS )GroupViTVisionModelr   rW   c                    r/  r   )rZ   r[   r:  vision_modelr1  re   rg   r   r    r[   s  r2  zGroupViTVisionModel.__init__r   c                 C   r3  r   )r=  r   r   r   r   r   r    r5  y  r6  z(GroupViTVisionModel.get_input_embeddingsNr   r  r  c                 C   s   | j ||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   r  r  )r=  )rf   r   r   r  r  r   r   r    rm   |  s   zGroupViTVisionModel.forwardr;  )rn   ro   rp   r   r  Zmain_input_namer[   r   r5  r   r   r   r   r   r   r   r   rm   rq   r   r   rg   r    r<  o  s*    
r<  c                       s<  e Zd ZeZdef fddZe						ddeej	 deej	 deej	 dee
 d	ee
 d
ee
 dejfddZe				ddeej dee
 d	ee
 d
ee
 dejf
ddZe									ddeej deej deej	 deej dee
 dee
 d	ee
 dee
 d
ee
 deeef fddZ  ZS )GroupViTModelrW   c              
      s6  t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _
|j| _|j| _t|| _t|| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _ttj| j| j
ddt| j
tjddtj| j
| j	dd| _tt| jj| _|   d S )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)r  )Zinplace) rZ   r[   r   text_configr   	TypeErrortypevision_configr   Zprojection_dimZprojection_intermediate_dimr_   Ztext_embed_dimZvision_embed_dimr%  r0  r:  r=  r   r   rt   ZBatchNorm1dZReLUvisual_projectiontext_projectionr   r   r<   rW   Zlogit_scale_init_valuelogit_scaler1  )rf   rW   rA  rD  rg   r   r    r[     sF   





zGroupViTModel.__init__Nr   r   r   r   r  r  r   c           
      C   sh   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||||d}|d }| |}	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTTextModel`].

        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```Nr8  r   )rW   r   r  r  r0  rF  )
rf   r   r   r   r   r  r  text_outputsr,  Ztext_featuresr   r   r    get_text_features  s   	
zGroupViTModel.get_text_featuresr   c                 C   sd   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||d}|d }| |}|S )aH  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`GroupViTVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nr>  r   )rW   r   r  r  r=  rE  )rf   r   r   r  r  vision_outputsr,  Zimage_featuresr   r   r    get_image_features  s   
z GroupViTModel.get_image_featuresreturn_lossoutput_segmentationc
              
   C   sP  |dur|n| j j}|dur|n| j j}|rd}|dur|n| j j}|	dur(|	n| j j}	| j||||	d}
| j||||||	d}|
d }| |}|d }| |}||j	ddd }||j	ddd }| j
 }t|| | }| }d}|r|
d }| |d|jd }|r|
d	 }n|
d
 }t||jd
d }||j	ddd }t|| | }||jd d|jd dd
d}||jd |jd d}t||| }||jd |jd |jd
 |jd	 }d}|rt|}|	s|dur|||||||
f}n||||||
f}|dur|f| S |S t||||||||
dS )aM  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_segmentation (`bool`, *optional*):
            Whether or not to return the segmentation logits.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NTr>  r8  r   r5   r~   r   r   r@   )r   r   r   r   r   r   r   r   )rW   r   rM  r  r  r=  r0  rE  rF  ZnormrG  expr   matmulr#   rI   r>   rU   rR   r$   r   )rf   r   r   r   r   rL  r   r  rM  r  rJ  rH  r   r   rG  r   r   Z
seg_logitsZimage_group_embedsrK   groupingZlogits_per_image_groupZflatten_groupingr   outputr   r   r    rm   /  s   )	




 

zGroupViTModel.forwardr-  r;  )	NNNNNNNNN)rn   ro   rp   r   r  r[   r   r   r   r   r   r   rI  rK  r   r   r   r   rm   rq   r   r   rg   r    r?    s    +.0	

r?  )r?  r  r.  r<  )r   Fr5   r   )Er   collections.abcr   dataclassesr   typingr   r   r   r   numpyrG   r   Ztorch.utils.checkpointr   Zactivationsr	   Zmodeling_attn_mask_utilsr
   r   Zmodeling_outputsr   r   Zmodeling_utilsr   utilsr   r   r   r   Zconfiguration_groupvitr   r   r   Z
get_loggerrn   loggerr   r!   r$   rF   r4   floatr   r?   rQ   rU   r9  rV   rr   r   r   r   r   r   r   rb   r   r\   r   r  r  r"  r%  r.  r:  r<  r?  __all__r   r   r   r    <module>   s^   
$

072"K(^o2':a\251  "