"""PyTorch CLIPSeg model."""

import copy
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from .configuration_clipseg import CLIPSegConfig, CLIPSegTextConfig, CLIPSegVisionConfig


logger = logging.get_logger(__name__)


# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/2021-03-07-clip.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clipseg_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
class CLIPSegOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class CLIPSegDecoderOutput(ModelOutput):
    """
    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
            Classification scores for each pixel.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    """

    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class CLIPSegImageSegmentationOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        ...
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`CLIPSegVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    conditional_embeddings: Optional[torch.FloatTensor] = None
    pooled_output: Optional[torch.FloatTensor] = None
    vision_model_output: BaseModelOutputWithPooling = None
    decoder_output: CLIPSegDecoderOutput = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["vision_model_output", "decoder_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class CLIPSegVisionEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=True) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class CLIPSegTextEmbeddings(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CLIPSegAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[CLIPSegTextConfig, CLIPSegVisionConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # CLIPSeg's text model uses both `causal_attention_mask` and `attention_mask`;
        # if the FA2 kernel is used, `is_causal` is inferred from `causal_attention_mask`
        if self.config._attn_implementation == "flash_attention_2":
            self.is_causal = causal_attention_mask is not None
        else:
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class CLIPSegMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class CLIPSegEncoderLayer(nn.Module):
    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegPreTrainedModel(PreTrainedModel):
    config_class = CLIPSegConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, CLIPSegTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, CLIPSegVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, CLIPSegAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, CLIPSegMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, CLIPSegModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class CLIPSegEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([CLIPSegEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class CLIPSegTextTransformer(nn.Module):
    def __init__(self, config: CLIPSegTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = CLIPSegTextEmbeddings(config)
        self.encoder = CLIPSegEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        # For `pooled_output` computation
        self.eos_token_id = config.eos_token_id

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # CLIPSeg's text model uses a causal mask, prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        if self.eos_token_id == 2:
            # The `eos_token_id` was incorrect before version 4.24.0 - keep what was used during pretraining.
            # Take features from the EOT embedding (the highest token id in each sequence).
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
            ]
        else:
            # The config was updated with the correct `eos_token_id`, so extra new tokens can be added.
            pooled_output = last_hidden_state[
                torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
                (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
                .int()
                .argmax(dim=-1),
            ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class CLIPSegTextModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegTextConfig
    _no_split_modules = ["CLIPSegTextEmbeddings", "CLIPSegEncoderLayer"]

    def __init__(self, config: CLIPSegTextConfig):
        super().__init__(config)
        self.text_model = CLIPSegTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class CLIPSegVisionTransformer(nn.Module):
    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = CLIPSegVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = CLIPSegEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor],
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class CLIPSegVisionModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: CLIPSegVisionConfig):
        super().__init__(config)
        self.vision_model = CLIPSegVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring
class CLIPSegModel(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        if not isinstance(config.text_config, CLIPSegTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type CLIPSegTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, CLIPSegVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type CLIPSegVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = CLIPSegTextTransformer(text_config)
        self.vision_model = CLIPSegVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        # Use CLIPSeg's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`CLIPSegVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```"""
        # Use CLIPSeg's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPSegOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, CLIPSegModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use CLIPSeg's config for some fields (if specified) instead of those of the vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clipseg_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class CLIPSegDecoderLayer(nn.Module):
    """
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    """

    def __init__(self, config: CLIPSegConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = CLIPSegAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = CLIPSegMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )

        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm1(hidden_states)

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.layer_norm2(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class CLIPSegDecoder(CLIPSegPreTrainedModel):
    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.conditional_layer = config.conditional_layer

        self.film_mul = nn.Linear(config.projection_dim, config.reduce_dim)
        self.film_add = nn.Linear(config.projection_dim, config.reduce_dim)

        if config.use_complex_transposed_convolution:
            transposed_kernels = (config.vision_config.patch_size // 4, config.vision_config.patch_size // 4)

            self.transposed_convolution = nn.Sequential(
                nn.Conv2d(config.reduce_dim, config.reduce_dim, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim,
                    config.reduce_dim // 2,
                    kernel_size=transposed_kernels[0],
                    stride=transposed_kernels[0],
                ),
                nn.ReLU(),
                nn.ConvTranspose2d(
                    config.reduce_dim // 2, 1, kernel_size=transposed_kernels[1], stride=transposed_kernels[1]
                ),
            )
        else:
            self.transposed_convolution = nn.ConvTranspose2d(
                config.reduce_dim, 1, config.vision_config.patch_size, stride=config.vision_config.patch_size
            )

        depth = len(config.extract_layers)
        self.reduces = nn.ModuleList(
            [nn.Linear(config.vision_config.hidden_size, config.reduce_dim) for _ in range(depth)]
        )

        decoder_config = copy.deepcopy(config.vision_config)
        decoder_config.hidden_size = config.reduce_dim
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        decoder_config.hidden_act = "relu"
        self.layers = nn.ModuleList([CLIPSegDecoderLayer(decoder_config) for _ in range(len(config.extract_layers))])

    def forward(
        self,
        hidden_states: Tuple[torch.Tensor],
        conditional_embeddings: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        activations = hidden_states[::-1]

        output = None
        for i, (activation, layer, reduce) in enumerate(zip(activations, self.layers, self.reduces)):
            if output is not None:
                output = reduce(activation) + output
            else:
                output = reduce(activation)

            if i == self.conditional_layer:
                # FiLM conditioning: modulate the activations with the conditional (text or image) embedding
                output = self.film_mul(conditional_embeddings) * output.permute(1, 0, 2) + self.film_add(
                    conditional_embeddings
                )
                output = output.permute(1, 0, 2)

            layer_outputs = layer(
                output, attention_mask=None, causal_attention_mask=None, output_attentions=output_attentions
            )

            output = layer_outputs[0]

            if output_hidden_states:
                all_hidden_states += (output,)

            if output_attentions:
                all_attentions += (layer_outputs[1],)

        # remove the CLS token and reshape to [batch_size, reduce_dim, height, width]
        output = output[:, 1:, :].permute(0, 2, 1)

        size = int(math.sqrt(output.shape[2]))

        batch_size = conditional_embeddings.shape[0]
        output = output.view(batch_size, output.shape[1], size, size)

        logits = self.transposed_convolution(output).squeeze(1)

        if not return_dict:
            return tuple(v for v in [logits, all_hidden_states, all_attentions] if v is not None)

        return CLIPSegDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
        )


@auto_docstring(
    custom_intro="""
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    """
)
class CLIPSegForImageSegmentation(CLIPSegPreTrainedModel):
    config_class = CLIPSegConfig

    def __init__(self, config: CLIPSegConfig):
        super().__init__(config)

        self.config = config

        self.clip = CLIPSegModel(config)
        self.extract_layers = config.extract_layers

        self.decoder = CLIPSegDecoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_conditional_embeddings(
        self,
        batch_size: Optional[int] = None,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        conditional_pixel_values: Optional[torch.Tensor] = None,
    ):
        if input_ids is not None:
            # compute conditional embeddings from texts
            if len(input_ids) != batch_size:
                raise ValueError("Make sure to pass as many prompt texts as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_text_features(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids
                )
        elif conditional_pixel_values is not None:
            # compute conditional embeddings from images
            if len(conditional_pixel_values) != batch_size:
                raise ValueError("Make sure to pass as many prompt images as there are query images")
            with torch.no_grad():
                conditional_embeddings = self.clip.get_image_features(conditional_pixel_values)
        else:
            raise ValueError(
                "Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`"
            )

        return conditional_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        conditional_pixel_values: Optional[torch.FloatTensor] = None,
        conditional_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = True,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CLIPSegImageSegmentationOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from PIL import Image
        >>> import requests

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the query images through the frozen CLIP vision encoder
        with torch.no_grad():
            vision_outputs = self.clip.vision_model(
                pixel_values=pixel_values,
                output_attentions=output_attentions,
                output_hidden_states=True,  # we need the intermediate hidden states
                interpolate_pos_encoding=interpolate_pos_encoding,
                return_dict=return_dict,
            )
            pooled_output = self.clip.visual_projection(vision_outputs[1])

            hidden_states = vision_outputs.hidden_states if return_dict else vision_outputs[2]
            # we add +1 here as the hidden states also include the initial embeddings
            activations = [hidden_states[i + 1] for i in self.extract_layers]

            # update vision_outputs
            if return_dict:
                vision_outputs = BaseModelOutputWithPooling(
                    last_hidden_state=vision_outputs.last_hidden_state,
                    pooler_output=vision_outputs.pooler_output,
                    hidden_states=vision_outputs.hidden_states if output_hidden_states else None,
                    attentions=vision_outputs.attentions,
                )
            else:
                vision_outputs = (
                    vision_outputs[:2] + vision_outputs[3:] if not output_hidden_states else vision_outputs
                )

        # step 2: compute conditional embeddings, either from text, images or an own provided embedding
        if conditional_embeddings is None:
            conditional_embeddings = self.get_conditional_embeddings(
                batch_size=pixel_values.shape[0],
                input_ids=input_ids,
                attention_mask=attention_mask,
                position_ids=position_ids,
                conditional_pixel_values=conditional_pixel_values,
            )
        else:
            if conditional_embeddings.shape[0] != pixel_values.shape[0]:
                raise ValueError(
                    "Make sure to pass as many conditional embeddings as there are query images in the batch"
                )
            if conditional_embeddings.shape[1] != self.config.projection_dim:
                raise ValueError(
                    "Make sure that the feature dimension of the conditional embeddings matches"
                    " `config.projection_dim`."
                )

        # step 3: forward both the pooled output and the activations through the lightweight decoder to predict masks
        decoder_outputs = self.decoder(
            activations,
            conditional_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        logits = decoder_outputs.logits if return_dict else decoder_outputs[0]

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss_fn = nn.BCEWithLogitsLoss()
            loss = loss_fn(logits, labels)

        if not return_dict:
            output = (logits, conditional_embeddings, pooled_output, vision_outputs, decoder_outputs)
            return ((loss,) + output) if loss is not None else output

        return CLIPSegImageSegmentationOutput(
            loss=loss,
            logits=logits,
            conditional_embeddings=conditional_embeddings,
            pooled_output=pooled_output,
            vision_model_output=vision_outputs,
            decoder_output=decoder_outputs,
        )


__all__ = [
    "CLIPSegModel",
    "CLIPSegPreTrainedModel",
    "CLIPSegTextModel",
    "CLIPSegVisionModel",
    "CLIPSegForImageSegmentation",
]