o
    Zh4                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZmZ ddlmZm Z m!Z! e"e#Z$dej%dej%fddZ&dej%dej%fddZ'eG dd deZ(eG dd deZ)eG dd deZ*G dd dej+Z,G dd dej+Z-	dFd ej+d!ej%d"ej%d#ej%d$eej% d%e.d&e.fd'd(Z/G d)d* d*ej+Z0G d+d, d,ej+Z1G d-d. d.ej+Z2eG d/d0 d0eZ3G d1d2 d2ej+Z4G d3d4 d4ej+Z5G d5d6 d6e3Z6G d7d8 d8ej+Z7G d9d: d:e3Z8eG d;d< d<e3Z9G d=d> d>ej+Z:G d?d@ d@e3Z;edAdBG dCdD dDe3Z<g dEZ=dS )GzPyTorch CLIPSeg model.    N)	dataclass)AnyCallableOptionalTupleUnion)nn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                 C   s   t j| tjt| | jdS )Ndevice)r   
functionalZcross_entropytorcharangelenr   )r    r!   [/var/www/auris/lib/python3.10/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_loss'   s   r#   
similarityc                 C   s    t | }t |  }|| d S )Ng       @)r#   t)r$   Zcaption_lossZ
image_lossr!   r!   r"   clipseg_loss,   s   r&   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeej ed< dZeed< dZeed	< d
ee fddZdS )CLIPSegOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                       t  fdd  D S )Nc                 3   .    | ]}|d vr | nt  | V  qdS ))r-   r.   Ngetattrto_tuple.0kselfr!   r"   	<genexpr>R   
    
z)CLIPSegOutput.to_tuple.<locals>.<genexpr>tuplekeysr7   r!   r7   r"   r3   Q      zCLIPSegOutput.to_tuple)__name__
__module____qualname____doc__r(   r   r   FloatTensor__annotations__r)   r*   r+   r,   r-   r   r.   r   r   r3   r!   r!   r!   r"   r'   2   s   
 r'   c                   @   sP   e Zd ZU dZdZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dS )CLIPSegDecoderOutputa  
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Nr   hidden_states
attentions)r?   r@   rA   rB   r   r   r   rC   rD   rF   r   rG   r!   r!   r!   r"   rE   X   s
   
 rE   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeej ed< dZeed< dZeed< d	ee fd
dZdS )CLIPSegImageSegmentationOutputa,  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    Nr(   r   conditional_embeddingspooled_outputr.   decoder_outputr   c                    r/   )Nc                 3   r0   ))r.   rK   Nr1   r4   r7   r!   r"   r9      r:   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>r;   r7   r!   r7   r"   r3   ~   r>   z'CLIPSegImageSegmentationOutput.to_tuple)r?   r@   rA   rB   r(   r   r   rC   rD   r   rI   rJ   r.   r   rK   rE   r   r   r3   r!   r!   r!   r"   rH   l   s   
 	rH   c                       sX   e Zd Zdef fddZdejdededejfdd	Zddej	dejfddZ
  ZS )CLIPSegVisionEmbeddingsconfigc                    s   t    || _|j| _|j| _|j| _tt	
| j| _tj|j| j| j| jdd| _| j| j d | _| jd | _t| j| j| _| jdt	| jddd d S )NF)Zin_channelsZout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__rM   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   Zrandnclass_embeddingConv2dZnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr   expandr8   rM   	__class__r!   r"   rX      s"   
"z CLIPSegVisionEmbeddings.__init__
embeddingsheightwidthr   c                 C   s  |j d d }| jjd}|j d d }tj s(||kr(||kr(| | jS |ddddf }|ddddf }|j d }	|| j }
|| j }t	|d }|
d|||	}|dddd}tjj||
|fdd	d
}|dddddd|	}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrT   g      ?r	   rQ   ZbicubicF)sizemodeZalign_cornersdim)shaperd   weightZ	unsqueezer   Zjit
is_tracingrR   r\   r   reshapepermuter   r   Zinterpolateviewcat)r8   rj   rk   rl   ra   rd   rb   Zclass_pos_embedZpatch_pos_embedrp   Z
new_heightZ	new_widthZsqrt_num_positionsr!   r!   r"   interpolate_pos_encoding   s*   



z0CLIPSegVisionEmbeddings.interpolate_pos_encodingTpixel_valuesc           
   
   C   s   |j \}}}}|s&|| jks|| jkr&td| d| d| j d| j d	| |}|ddd}| j|dd}tj	||gdd}	|rR|	| 
|	|| }	|	S |	| | j }	|	S )	NzInput image size (*z) doesn't match model ().rQ   r   rT   ro   )rq   r[   
ValueErrorr`   flatten	transposer^   rf   r   rw   rx   rd   rR   )
r8   ry   rx   
batch_size_rk   rl   Zpatch_embedsZclass_embedsrj   r!   r!   r"   forward   s    
zCLIPSegVisionEmbeddings.forward)T)r?   r@   rA   r   rX   r   Tensorintrx   rC   r   __classcell__r!   r!   rh   r"   rL      s     )rL   c                	       sX   e Zd Zdef fddZ			ddeej deej deej dej	fd	d
Z
  ZS )CLIPSegTextEmbeddingsrM   c                    sR   t    |j}t|j|| _t|j|| _| j	dt
|jddd d S )NrR   rS   FrU   )rW   rX   rY   r   rc   Z
vocab_sizetoken_embeddingZmax_position_embeddingsrd   re   r   r   rf   r8   rM   rZ   rh   r!   r"   rX      s   

zCLIPSegTextEmbeddings.__init__N	input_idsrR   inputs_embedsr   c                 C   s   |d ur	|j d n|j d }| jjj d }||kr#td| d| |d u r2| jd d d |f }|d u r;| |}| |}|| }|S )NrT   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rq   rd   rr   r|   rR   r   )r8   r   rR   r   
seq_lengthZmax_position_embeddingZposition_embeddingsrj   r!   r!   r"   r      s"   

zCLIPSegTextEmbeddings.forward)NNN)r?   r@   rA   r   rX   r   r   
LongTensorrC   r   r   r   r!   r!   rh   r"   r      s    r           modulequerykeyvalueattention_maskscalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )NrT   r   )rp   dtype)ptrainingr   rQ   )r   matmulr~   r   r   ZsoftmaxZfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputr!   r!   r"   eager_attention_forward  s   
r   c                       sv   e Zd ZdZdeeef f fddZ			ddej	de
ej	 d	e
ej	 d
e
e deej	e
ej	 f f
ddZ  ZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrM   c                    s   t    || _|j| _|j| _| j| j | _| j| j | jkr-td| j d| j d| jd | _	|j
| _d| _t| j| j| _t| j| j| _t| j| j| _t| j| j| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r{         F)rW   rX   rM   rY   rZ   num_attention_heads	num_headshead_dimr|   scaleZattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrg   rh   r!   r"   rX     s$   

zCLIPSegAttention.__init__NFrF   r   causal_attention_maskoutput_attentionsr   c              
   C   sL  |j \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
| jj	dkrY|durR|durR|| }n|durX|}n|du| _
t}| jj	dkrz| jj	dkrt|rttd nt| jj	 }|| ||	|
|| j
| j| jsdn| jd	\}}|||| }| |}|sd}||fS )
z#Input shape: Batch x Time x Channelr   rQ   Zflash_attention_2NeagerZsdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r   )rq   r   r   r   rv   r   r   r~   rM   Z_attn_implementationr   r   loggerZwarning_oncer   r   r   r   rt   r   r   )r8   rF   r   r   r   r   r   rZ   Zqueriesr=   valuesZattention_interfacer   r   r!   r!   r"   r   /  sH   	






zCLIPSegAttention.forward)NNF)r?   r@   rA   rB   r   r   r   rX   r   r   r   boolr   r   r   r!   r!   rh   r"   r     s"    r   c                       s2   e Zd Z fddZdejdejfddZ  ZS )
CLIPSegMLPc                    sD   t    || _t|j | _t|j|j	| _
t|j	|j| _d S N)rW   rX   rM   r
   
hidden_actactivation_fnr   r   rY   intermediate_sizefc1fc2rg   rh   r!   r"   rX   j  s
   
zCLIPSegMLP.__init__rF   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r8   rF   r!   r!   r"   r   q  s   


zCLIPSegMLP.forward)r?   r@   rA   rX   r   r   r   r   r!   r!   rh   r"   r   i  s    r   c                       sT   e Zd Zdef fddZ	ddejdejdejdee d	e	ej
 f
d
dZ  ZS )CLIPSegEncoderLayerrM   c                    R   t    |j| _t|| _tj| j|jd| _	t
|| _tj| j|jd| _d S N)ZepsrW   rX   rY   rZ   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rg   rh   r!   r"   rX   z     


zCLIPSegEncoderLayer.__init__FrF   r   r   r   r   c                 C   sd   |}|  |}| j||||d\}}|| }|}| |}| |}|| }|f}|r0||f7 }|S aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rF   r   r   r   )r   r   r   r   r8   rF   r   r   r   Zresidualr   Zoutputsr!   r!   r"   r     s"   




zCLIPSegEncoderLayer.forwardF)r?   r@   rA   r   rX   r   r   r   r   r   rC   r   r   r!   r!   rh   r"   r   y  s    r   c                   @   s    e Zd ZeZdZdZdd ZdS )CLIPSegPreTrainedModelclipTc                 C   sV  | j j}t|tr"|jjjjd|d d |jjjjd|d d nt|t	rW| j j}t
jj|jd|jd | d t
jj|jj|j j| d t
jj|jj|j j| d nt|tr| j j}|jd d|j j d  | }|jd | }t
jj|jj|d t
jj|jj|d t
jj|jj|d t
jj|jj|d n_t|tr| j j}|j jd d|j j d  | }d|j j d | }t
jj|jj|d t
jj|jj|d n't|trt
jj|jj|jd | j j d t
jj|jj|jd | j j d t|t
jr|j j!  |jj"d t|t
j#r'|j dur)|j j!  dS dS dS )	zInitialize the weightsr   g{Gz?)meanstdr   )r   rQ   g      ?N)$rM   Zinitializer_factor
isinstancer   r   rr   dataZnormal_rd   rL   r   initr^   rZ   r`   Zinitializer_ranger   num_hidden_layersr   r   r   r   r   rY   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   rP   Zzero_Zfill_r   )r8   r   factorZin_proj_stdZout_proj_stdZfc_stdr!   r!   r"   _init_weights  sL   



 
z$CLIPSegPreTrainedModel._init_weightsN)r?   r@   rA   r   config_classZbase_model_prefixZsupports_gradient_checkpointingr   r!   r!   r!   r"   r     s
    r   c                       st   e Zd ZdZdef fddZ					ddeej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    rM   c                    s:   t     | _t fddt jD | _d| _d S )Nc                       g | ]}t  qS r!   )r   r5   r   rM   r!   r"   
<listcomp>      z+CLIPSegEncoder.__init__.<locals>.<listcomp>F)	rW   rX   rM   r   
ModuleListranger   layersgradient_checkpointingrg   rh   r   r"   rX     s   
 
zCLIPSegEncoder.__init__Nr   r   r   output_hidden_statesreturn_dictr   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|r"dnd}|r(dnd}|}	t| jD ]1\}
}|r<||	f }| jrM| jrM| |j	|	|||}n||	|||d}|d }	|rb||d f }q1|rj||	f }|sxt
dd |	||fD S t|	||dS )	a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr!   )r   r   r   c                 s       | ]	}|d ur|V  qd S r   r!   r5   vr!   r!   r"   r9   7      z)CLIPSegEncoder.forward.<locals>.<genexpr>)last_hidden_staterF   rG   )rM   r   r   use_return_dict	enumerater   r   r   Z_gradient_checkpointing_func__call__r<   r   )r8   r   r   r   r   r   r   Zencoder_statesall_attentionsrF   idxZencoder_layerlayer_outputsr!   r!   r"   r     sF   &

zCLIPSegEncoder.forwardNNNNN)r?   r@   rA   rB   r   rX   r   r   r   r   r   r   r   r   r   r!   r!   rh   r"   r     s*    	
r   c                       s   e Zd Zdef fddZe						ddeej deej deej dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegTextTransformerrM   c                    sH   t    || _|j}t|| _t|| _tj	||j
d| _|j| _d S r   )rW   rX   rM   rY   r   rj   r   encoderr   r   r   final_layer_normeos_token_idr   rh   r!   r"   rX   >  s   


zCLIPSegTextTransformer.__init__Nr   r   rR   r   r   r   r   c                 C   sj  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| }|d|d }| j||d}t||j	|j
d}	|d urLt||j	}| j|||	|||d}
|
d }| |}| jdkr|tj|jd |j
d|jtj|j
djdd	f }n|tj|jd |j
d|jtj|j
d| jk jdd	f }|s||f|
d
d   S t|||
j|
jdS )NzYou have to specify input_idsrT   )r   rR   r   )r   r   r   r   r   r   r   rQ   )r   r   ro   r   r   pooler_outputrF   rG   )rM   r   r   r   r|   rm   rv   rj   r   r   r   r   r   r   r   r   r   rq   r   r   Zargmaxr   rF   rG   )r8   r   r   rR   r   r   r   Zinput_shaperF   r   encoder_outputsr   rJ   r!   r!   r"   r   I  s\   

	

	zCLIPSegTextTransformer.forwardNNNNNN)r?   r@   rA   r   rX   r   r   r   r   r   r   r   r   r   r   r!   r!   rh   r"   r   =  s0    
r   c                       s   e Zd ZeZddgZdef fddZdejfddZ	d	d
 Z
e						ddeej deej deej dee dee dee deeef fddZ  ZS )CLIPSegTextModelr   r   rM   c                    "   t  | t|| _|   d S r   )rW   rX   r   
text_model	post_initrg   rh   r!   r"   rX        
zCLIPSegTextModel.__init__r   c                 C   
   | j jjS r   r   rj   r   r7   r!   r!   r"   get_input_embeddings     
z%CLIPSegTextModel.get_input_embeddingsc                 C   s   || j j_d S r   r   )r8   r   r!   r!   r"   set_input_embeddings  s   z%CLIPSegTextModel.set_input_embeddingsNr   r   rR   r   r   r   c                 C   s   | j ||||||dS )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rR   r   r   r   )r   )r8   r   r   rR   r   r   r   r!   r!   r"   r     s   zCLIPSegTextModel.forwardr   )r?   r@   rA   r   r   Z_no_split_modulesrX   r   Moduler   r   r   r   r   r   r   r   r   r   r   r   r!   r!   rh   r"   r     s8    
r   c                       sp   e Zd Zdef fddZe				ddeej dee	 dee	 d	ee	 d
ee	 de
eef fddZ  ZS )CLIPSegVisionTransformerrM   c                    sR   t    || _|j}t|| _tj||jd| _	t
|| _tj||jd| _d S r   )rW   rX   rM   rY   rL   rj   r   r   r   pre_layrnormr   r   post_layernormr   rh   r!   r"   rX     s   


z!CLIPSegVisionTransformer.__init__NTry   r   r   r   rx   r   c           
      C   s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}| j||d}| |}| j||||d}|d }|d d dd d f }	| |	}	|sS||	f|dd   S t||	|j	|j
dS )N)rx   )r   r   r   r   r   r   r   )rM   r   r   r   rj   r  r   r  r   rF   rG   )
r8   ry   r   r   r   rx   rF   r   r   rJ   r!   r!   r"   r     s.   	

z CLIPSegVisionTransformer.forward)NNNT)r?   r@   rA   r   rX   r   r   r   rC   r   r   r   r   r   r   r!   r!   rh   r"   r    s(    

r  c                       s   e Zd ZeZdZdef fddZdejfddZ	e
						ddeej d
ee dee dee dee deeef fddZ  ZS )CLIPSegVisionModelry   rM   c                    r   r   )rW   rX   r  vision_modelr   rg   rh   r!   r"   rX     r   zCLIPSegVisionModel.__init__r   c                 C   r   r   )r  rj   r`   r7   r!   r!   r"   r   
  r   z'CLIPSegVisionModel.get_input_embeddingsNTr   r   rx   r   c                 C   s   | j |||||dS )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```ry   r   r   rx   r   )r  )r8   ry   r   r   rx   r   r!   r!   r"   r     s   zCLIPSegVisionModel.forwardNNNTN)r?   r@   rA   r   r   Zmain_input_namerX   r   r  r   r   r   r   rC   r   r   r   r   r   r   r!   r!   rh   r"   r     s0    
r  c                       s>  e Zd ZeZdef fddZe						ddeej	 deej	 deej	 dee
 d	ee
 d
ee
 dejfddZe					ddeej dee
 d	ee
 de
d
ee
 dejfddZe									ddeej deej deej	 deej dee
 dee
 d	ee
 de
d
ee
 deeef fddZ  ZS )r   rM   c                    s   t  | t|jtstdt|j dt|jts(tdt|j d|j}|j}|j	| _	|j
| _|j
| _t|| _t|| _tj| j| j	dd| _tj| j| j	dd| _tt| jj| _|   d S )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rP   )rW   rX   r   text_configr   	TypeErrortypevision_configr   projection_dimrY   r   r   r   r   r  r  r   r   r   r   r]   r   ZtensorrM   Zlogit_scale_init_valuelogit_scaler   )r8   rM   r
  r  rh   r!   r"   rX   7  s0   

zCLIPSegModel.__init__Nr   r   rR   r   r   r   r   c           
      C   sh   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j||||||d}|d }| |}	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```Nr   r   )rM   r   r   r   r   r   )
r8   r   r   rR   r   r   r   text_outputsrJ   Ztext_featuresr!   r!   r"   get_text_featuresW  s   	
zCLIPSegModel.get_text_featuresTry   rx   c           	      C   sf   |dur|n| j j}|dur|n| j j}|dur|n| j j}| j|||||d}|d }| |}|S )aI  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```Nr  r   )rM   r   r   r   r  r   )	r8   ry   r   r   rx   r   vision_outputsrJ   Zimage_featuresr!   r!   r"   get_image_features  s    
zCLIPSegModel.get_image_featuresreturn_lossc
              	   C   s(  |dur|n| j j}|dur|n| j j}|	dur|	n| j j}	| j|||||	d}
| j||||||	d}|
d }| |}|d }| |}||jdddd }||jdddd }| j	
 }t|| | }| }d}|rtt|}|	s||||||
f}|dur|f| S |S t|||||||
d	S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  r   r   rQ   rT   T)r   rp   Zkeepdim)r(   r)   r*   r+   r,   r-   r.   )rM   r   r   r   r  r   r   r   Znormr  expr   r   r%   r&   r'   )r8   r   ry   r   rR   r  r   r   rx   r   r  r  r,   r+   r  r*   r)   r(   outputr!   r!   r"   r     sV   '	


zCLIPSegModel.forwardr   r  )	NNNNNNNTN)r?   r@   rA   r   r   rX   r   r   r   r   r   rC   r  r  r   r   r   r'   r   r   r!   r!   rh   r"   r   3  s     .2	

r   c                       sX   e Zd ZdZdef fddZ	ddejdejdejd	ee	 d
e
ej f
ddZ  ZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    rM   c                    r   r   r   rg   rh   r!   r"   rX      r   zCLIPSegDecoderLayer.__init__FrF   r   r   r   r   c                 C   sd   |}| j ||||d\}}|| }| |}|}| |}|| }| |}|f}|r0||f7 }|S r   )r   r   r   r   r   r!   r!   r"   r   (  s"   




zCLIPSegDecoderLayer.forwardr   )r?   r@   rA   rB   r   rX   r   r   r   r   r   rC   r   r   r!   r!   rh   r"   r    s    r  c                       s\   e Zd Zdef fddZ			ddeej dejdee	 d	ee	 d
ee	 f
ddZ
  ZS )CLIPSegDecoderrM   c                    sX  t     j| _t j j| _t j j| _ j	r` j
jd  j
jd f}ttj j jdddt tj j jd |d |d dt tj jd d|d |d d| _ntj jd j
j j
jd| _t j}t fd	d
t|D | _t j
 j_ j_ j_d_tfdd
tt jD | _d S )N   r	   r   )rN   paddingrQ   r   )rN   rO   )rO   c                    s   g | ]}t  jj jqS r!   )r   r   r  rY   
reduce_dimr   r   r!   r"   r   s  s    z+CLIPSegDecoder.__init__.<locals>.<listcomp>Zreluc                    r   r!   )r  r   )decoder_configr!   r"   r   {  r   )rW   rX   conditional_layerr   r   r  r  film_mulfilm_addZ"use_complex_transposed_convolutionr  r\   Z
Sequentialr_   ZReLUZConvTranspose2dtransposed_convolutionr    extract_layersr   r   reducescopydeepcopyrY   Zdecoder_num_attention_headsr   Zdecoder_intermediate_sizer   r   r   )r8   rM   Ztransposed_kernelsdepthrh   )rM   r  r"   rX   S  sB   
(zCLIPSegDecoder.__init__NTrF   rI   r   r   r   c                 C   sp  |rdnd }|r
dnd }|d d d }d }	t t|| j| jD ]O\}
\}}}|	d ur1|||	 }	n||}	|
| jkrR| ||	ddd | | }	|	ddd}	||	d d |d}|d }	|re||	f7 }|rn||d f7 }q|	d d dd d d f ddd}	tt	
|	jd }|jd }|	||	jd ||}	| |	d}|stdd |||fD S t|||d	S )
Nr!   rT   r   r   rQ   )r   r   r   c                 s   r   r   r!   r   r!   r!   r"   r9     r   z)CLIPSegDecoder.forward.<locals>.<genexpr>)r   rF   rG   )r   zipr   r"  r  r  ru   r  r   mathsqrtrq   rv   r   Zsqueezer<   rE   )r8   rF   rI   r   r   r   Zall_hidden_statesr   activationsr  iZ
activationlayerreducer   rm   r   r   r!   r!   r"   r   }  sD   "

$
zCLIPSegDecoder.forward)NNT)r?   r@   rA   r   rX   r   r   r   r   r   r   r   r!   r!   rh   r"   r  R  s     .r  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )Zcustom_introc                       s   e Zd ZeZdef fddZ					ddee deej	 deej	 deej	 d	eej	 f
d
dZ
e											ddeej deej d	eej deej deej	 deej deej dee dee dedee deeef fddZ  ZS )CLIPSegForImageSegmentationrM   c                    s:   t  | || _t|| _|j| _t|| _|   d S r   )	rW   rX   rM   r   r   r!  r  decoderr   rg   rh   r!   r"   rX     s   

z$CLIPSegForImageSegmentation.__init__Nr   r   r   rR   conditional_pixel_valuesc                 C   s   |d ur.t ||krtdt  | jj|||d}W d    |S 1 s'w   Y  |S |d urYt ||kr<tdt  | j|}W d    |S 1 sRw   Y  |S td)Nz@Make sure to pass as many prompt texts as there are query images)r   rR   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r    r|   r   no_gradr   r  r  )r8   r   r   r   rR   r/  rI   r!   r!   r"   get_conditional_embeddings  s.   



z6CLIPSegForImageSegmentation.get_conditional_embeddingsTry   rI   labelsr   r   rx   r   r   c                    s  |dur|n| j j}t Q | jj||d|
|d}| j|d }|r'|jn|d   fdd| jD }|rHt	|j
|j|	rA|jnd|jd}n|	sV|dd |d	d  n|}W d   n1 sbw   Y  |du ry| j|jd
 ||||d}n|jd
 |jd
 krtd|jd | j jkrtd| j||||	|d}|r|jn|d
 }d}|dur||j}t }|||}|s|||||f}|dur|f| S |S t||||||dS )aX  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```NTr  r   rQ   c                    s   g | ]} |d   qS )r   r!   )r5   r*  rF   r!   r"   r   &  s    z7CLIPSegForImageSegmentation.forward.<locals>.<listcomp>r   r	   r   )r   r   r   rR   r/  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r   r   r   )r(   r   rI   rJ   r.   rK   )rM   r   r   r0  r   r  r   rF   r!  r   r   r   rG   r1  rq   r|   r  r.  r   r   r   r   ZBCEWithLogitsLossrH   )r8   r   ry   r/  rI   r   rR   r2  r   r   rx   r   r  rJ   r)  Zdecoder_outputsr   r(   Zloss_fnr  r!   r3  r"   r     s~   /

z#CLIPSegForImageSegmentation.forwardr   )NNNNNNNNNTN)r?   r@   rA   r   r   rX   r   r   r   r   r1  r   rC   r   r   r   r   r'   r   r   r!   r!   rh   r"   r-    sp    
	

r-  )r   r   r   r  r-  )r   )>rB   r#  r'  dataclassesr   typingr   r   r   r   r   r   Ztorch.utils.checkpointr   r)  r
   Zmodeling_attn_mask_utilsr   r   Zmodeling_outputsr   r   Zmodeling_utilsr   r   utilsr   r   r   r   Zconfiguration_clipsegr   r   r   Z
get_loggerr?   r   r   r#   r&   r'   rE   rH   r  rL   r   floatr   r   r   r   r   r   r   r   r  r  r   r  r  r-  __all__r!   r!   r!   r"   <module>   s|   
$T0
Q20a[443 f9d .