
    fTh6                        S r SSKJr  SSKJrJrJrJrJr  SSK	r	SSK	J
r
  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJrJrJr  SSKJrJr  SSKJrJrJrJrJ r   SSK!J"r"J#r#J$r$  \RJ                  " \&5      r'S\	RP                  S\	RP                  4S jr)S\	RP                  S\	RP                  4S jr*S\	RP                  S\	RP                  4S jr+\ " S S\5      5       r,\ " S S\5      5       r-\ " S S\5      5       r. " S S\
R^                  5      r0 " S S\
R^                  5      r1  SHS \
R^                  S!\	RP                  S"\	RP                  S#\	RP                  S$\\	RP                     S%\2S&\2S'\34S( jjr4 " S) S*\
R^                  5      r5 " S+ S,\
R^                  5      r6 " S- S.\
R^                  5      r7\ " S/ S0\5      5       r8 " S1 S2\
R^                  5      r9 " S3 S4\
R^                  5      r:\" S5S69 " S7 S8\85      5       r; " S9 S:\
R^                  5      r<\" S;S69 " S< S=\85      5       r=\ " S> S?\85      5       r>\ " S@ SA\85      5       r?\ " SB SC\85      5       r@\" SDS69 " SE SF\85      5       rA/ SGQrBg)IzPyTorch CLIP model.    )	dataclass)AnyCallableOptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplelogging	torch_int   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     [         R                  R                  U [        R                  " [        U 5      U R                  S95      $ )Ndevice)r	   
functionalcross_entropytorcharangelenr#   )r   s    ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr*   %   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)r*   t)r,   caption_loss
image_losss      r)   	clip_lossr1   )   s*    #J/L!*,,.1J%,,r+   tensorc                     [         R                  " U S5      n[         R                  " USSS9n[         R                  " US5      nU$ )z
This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
   T)dimkeepdim      ?)r&   powsum)r2   square_tensor
sum_tensornormed_tensors       r)   _get_vector_normr>   /   s<    
 IIfa(M=b$?JIIj#.Mr+   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
CLIPVisionModelOutput:   a  
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

Args:
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Nimage_embedslast_hidden_state.hidden_states
attentions )__name__
__module____qualname____firstlineno____doc__rB   r   r&   FloatTensor__annotations__rC   rD   r   rE   __static_attributes__rF   r+   r)   r@   r@   :   sr    * 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r+   r@   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
CLIPTextModelOutputW   a  
Base class for text model's outputs that also contains a pooling of the last hidden states.

Args:
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
Ntext_embedsrC   .rD   rE   rF   )rG   rH   rI   rJ   rK   rR   r   r&   rL   rM   rC   rD   r   rE   rN   rF   r+   r)   rP   rP   W   sr    * 04K%++,359x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r+   rP   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)
CLIPOutputt   a  
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
Nlosslogits_per_imagelogits_per_textrR   rB   text_model_outputvision_model_outputr    c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   n   >#    U  H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))rY   rZ   N)getattrto_tuple).0kselfs     r)   	<genexpr>&CLIPOutput.to_tuple.<locals>.<genexpr>   s<      
   LLDGRYZ^`aRbRkRkRmm s   25)tuplekeysra   s   `r)   r^   CLIPOutput.to_tuple   s#     
YY[
 
 	
r+   rF   )rG   rH   rI   rJ   rK   rV   r   r&   rL   rM   rW   rX   rR   rB   rY   r   rZ   r   r   r^   rN   rF   r+   r)   rT   rT   t   s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r+   rT   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )CLIPVisionEmbeddings   configc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebiasr4   r   position_idsr   r5   
persistent)super__init__rk   hidden_size	embed_dim
image_size
patch_sizer	   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr'   expandra   rk   	__class__s     r)   rw   CLIPVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr+   
embeddingsheightwidthr    c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr5   r8   r   r4   bicubicF)sizemodealign_cornersr6   )shaper   weight	unsqueezer&   jit
is_tracingrr   r{   r   reshapepermuter	   r$   interpolateviewcat)ra   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr6   
new_height	new_widthsqrt_num_positionss                r)   interpolate_pos_encoding-CLIPVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr+   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model ().)dtyper4   r   r5   r   )r   rz   
ValueErrorr   r   r   toflatten	transposer~   r   r&   r   r   r   rr   )ra   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r)   forwardCLIPVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr+   )	r~   rk   ry   rz   r   r   r   r{   r   F)rG   rH   rI   rJ   r   rw   r&   Tensorintr   rL   r   rN   __classcell__r   s   @r)   ri   ri      sj    q/ q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r+   ri   c            	          ^  \ rS rSrS\4U 4S jjr   S
S\\R                     S\\R                     S\\R                     S\R                  4S jjrS	rU =r$ )CLIPTextEmbeddings   rk   c                 N  > [         TU ]  5         UR                  n[        R                  " UR
                  U5      U l        [        R                  " UR                  U5      U l        U R                  S[        R                  " UR                  5      R                  S5      SS9  g )Nrr   rs   Frt   )rv   rw   rx   r	   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r&   r'   r   ra   rk   ry   r   s      r)   rw   CLIPTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r+   	input_idsrr   inputs_embedsr    c                 <   Ub  UR                   S   OUR                   S   nU R                  R                  R                   S   nXE:  a  [        SU SU 35      eUc  U R                  S S 2S U24   nUc  U R                  U5      nU R                  U5      nX6-   nU$ )Nr5   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r   r   r   rr   r   )ra   r   rr   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r)   r   CLIPTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H.d,<=S<TV 
 ,,Q^<L  00;M"55lC"8
r+   )r   r   )NNN)rG   rH   rI   rJ   r   rw   r   r&   
LongTensorrL   r   r   rN   r   r   s   @r)   r   r      so    

~ 

 153759	E,,- u//0   1 12	
 
 r+   r   modulequerykeyvalueattention_maskscalingdropoutoutput_attentionsc                    [         R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R                  R                  U	S[         R                  S9R                  UR                  5      n	[        R                  R                  XU R                  S9n	[         R                  " X5      n
U
R                  SS5      R                  5       n
U(       d  S n	X4$ )Nr5   r   )r6   r   )ptrainingr   r4   )r&   matmulr   r	   r$   softmaxfloat32r   r   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r)   eager_attention_forwardr     s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r+   c                      ^  \ rS rSrSrS\\\4   4U 4S jjr   SS\	R                  S\\	R                     S\\	R                     S\\   S	\\	R                  \\	R                     4   4
S
 jjrSrU =r$ )CLIPAttentioni,  z=Multi-headed attention from 'Attention Is All You Need' paperrk   c                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rv   rw   rk   rx   ry   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr	   Lineark_projv_projq_projout_projr   s     r)   rw   CLIPAttention.__init__/  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar+   rD   r   causal_attention_maskr   r    c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVSU R
                  5      R                  SS5      nU	R	                  XVSU R
                  5      R                  SS5      n	U
R	                  XVSU R
                  5      R                  SS5      n
U R                  R                  S:X  a
  USLU l	        OUb  Ub  X#-   nOUb  Un[        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UU	U
UU R                  U R                  U R                  (       d  S	OU R                   US
9	u  pUR#                  XVU5      R%                  5       nU R'                  U5      nU(       d  SnX4$ )z#Input shape: Batch x Time x Channelr5   r   r4   flash_attention_2Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r   r   )r   r   r   r   r   r   r   rk   _attn_implementationr   r   loggerwarning_oncer   r   r   r   r   r   r   )ra   rD   r   r   r   r   r   ry   queriesre   valuesattention_interfacer   r   s                 r)   r   CLIPAttention.forwardC  s    -:,?,?)
	++m,{{=)]+,,zr4==ISSTUWXYyyT]]CMMaQRSZRGQQRSUVW ;;++/BB2$>DN).C.O!/!G&2!6(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,/
%
! "))*)LWWYmmK0 L((r+   )rk   r   ry   r   r   r   r   r   r   r   r   )NNF)rG   rH   rI   rJ   rK   r   r   r   rw   r&   r   r   boolr   r   rN   r   r   s   @r)   r   r   ,  s    GBu%5~%EF B. 268<,17)||7) !.7)  (5	7)
 $D>7) 
u||Xell33	47) 7)r+   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )CLIPMLPi}  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)rv   rw   rk   r   
hidden_actactivation_fnr	   r   rx   intermediate_sizefc1fc2r   s     r)   rw   CLIPMLP.__init__~  sb    #F$5$5699V//1I1IJ99V55v7I7IJr+   rD   r    c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  )ra   rD   s     r)   r   CLIPMLP.forward  s4    /**=9/r+   )r  rk   r  r  )
rG   rH   rI   rJ   rw   r&   r   r   rN   r   r   s   @r)   r   r   }  s)    KU\\ ell  r+   r   c                      ^  \ rS rSrS\\\4   4U 4S jjr SS\R                  S\R                  S\R                  S\
\   S\\R                     4
S	 jjrS
rU =r$ )CLIPEncoderLayeri  rk   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)rv   rw   rx   ry   r   	self_attnr	   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r)   rw   CLIPEncoderLayer.__init__  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr+   rD   r   r   r   r    c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)rD   r   r   r   )r  r  r  r  )ra   rD   r   r   r   residualr   outputss           r)   r   CLIPEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr+   )ry   r  r  r  r  r   )rG   rH   rI   rJ   r   r   r   rw   r&   r   r   r   r   rL   r   rN   r   r   s   @r)   r
  r
    sv    Su%5~%EF S -2&||& &  %||	&
 $D>& 
u  	!& &r+   r
  c                   .    \ rS rSr\rSrSrSrSr	S r
Srg)CLIPPreTrainedModeli  clipTc                 "   U R                   R                  n[        U[        5      (       ad  UR                  R
                  R                  R                  SUS-  S9  UR                  R
                  R                  R                  SUS-  S9  GOF[        U[        5      (       a  U R                   R                  n[        R                  R                  UR                  SUR                  S-  U-  S9  [        R                  R                  UR                  R
                  UR                   R                  U-  S9  [        R                  R                  UR                  R
                  UR                   R                  U-  S9  GON[        U[         5      (       Ga!  U R                   R                  nUR                  S-  SUR                   R"                  -  S-  -  U-  nUR                  S-  U-  n[        R                  R                  UR$                  R
                  US9  [        R                  R                  UR&                  R
                  US9  [        R                  R                  UR(                  R
                  US9  [        R                  R                  UR*                  R
                  US9  GO[        U[,        5      (       a  U R                   R                  nUR                   R.                  S-  SUR                   R"                  -  S-  -  U-  nSUR                   R.                  -  S-  U-  n[        R                  R                  UR0                  R
                  US9  [        R                  R                  UR2                  R
                  US9  GO.[        U[4        5      (       a  [        R                  R                  UR6                  R
                  UR8                  S-  U R                   R                  -  S9  [        R                  R                  UR:                  R
                  UR<                  S-  U R                   R                  -  S9  GOk[        U[>        5      (       aa  [        R                  R                  UR:                  R
                  U R                   R.                  S-  U R                   R                  -  S9  O[        U[@        5      (       aa  [        R                  R                  UR6                  R
                  U R                   R.                  S-  U R                   R                  -  S9  O[        U[B        5      (       aj  [        R                  R                  URD                  R
                  U R                   RF                  R.                  S-  U R                   R                  -  S9  [        U[        RH                  5      (       aI  URJ                  R                  RM                  5         UR
                  R                  RO                  S5        [        U[        RP                  5      (       a3  URJ                  b%  URJ                  R                  RM                  5         ggg)	zInitialize the weightsr   g{Gz?)meanstdr   )r  r4   g      ?N))rk   initializer_factor
isinstancer   r   r   datanormal_r   ri   r	   initr~   ry   r   initializer_ranger   num_hidden_layersr   r   r   r   r   rx   r  r  	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr  rq   zero_fill_r   )ra   r   factorin_proj_stdout_proj_stdfc_stds         r)   _init_weights!CLIPPreTrainedModel._init_weights  s   //f011""))..66CVd]6S%%,,1199sQU9V 455[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk..[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE(([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?	**GGOO&&--))4/$++2P2PP   GGOO((//++T1DKK4R4RR    =>>GGOO((//KK++T1DKK4R4RR    ;<<GGOO&&--KK++T1DKK4R4RR    :;;GGOO!!((KK--994?$++B`B``  
 fbll++KK""$MM$$S)fbii((V[[-DKK""$ .E(r+   rF   N)rG   rH   rI   rJ   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_2r7  rN   rF   r+   r)   r  r    s#    L&*#N!6%r+   r  c                      ^  \ rS rSrSrS\4U 4S jjr\    SS\\	R                     S\\	R                     S\\   S\\   S	\4
S
 jj5       rSrU =r$ )CLIPEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`CLIPEncoderLayer`].

Args:
    config: CLIPConfig
rk   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
rv   rw   rk   r	   
ModuleListranger&  r
  layersgradient_checkpointing)ra   rk   r   r   s      r)   rw   CLIPEncoder.__init__  sS    mmuVMeMeGf$gGf!%5f%=Gf$gh&+# %hs   A&r   r   r   output_hidden_statesr    c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn[        U R                  5       Hr  u  pU(       a  Xh4-   nU R
                  (       a1  U R                  (       a   U R                  U
R                  UUUU5      nO	U
" UUUUS9nUS   nU(       d  Mj  X{S   4-   nMt     U(       a  Xh4-   n[        UUUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrF   )r   r   r   )rC   rD   rE   )
rk   r   rF  	enumeraterC  rD  r   _gradient_checkpointing_func__call__r   )ra   r   r   r   r   rF  encoder_statesall_attentionsrD   idxencoder_layerlayer_outputss               r)   r   CLIPEncoder.forward  s   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	  40d%"+DKK"8C#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- #90  +.>>N+(%
 	
r+   )rk   rD  rC  NNNN)rG   rH   rI   rJ   rK   r   rw   r   r   r&   r   r   r   r   rN   r   r   s   @r)   r?  r?    s    ,z ,  268<,0/3M
 !.M
  (5	M

 $D>M
 'tnM
 
M
 M
r+   r?  c                      ^  \ rS rSrS\4U 4S jjr\\     SS\\	R                     S\\	R                     S\\	R                     S\\   S\\   S	\4S
 jj5       5       rSrU =r$ )CLIPTextTransformeri_  rk   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        UR                  U l        UR                  S:H  U l        g )Nr  r   )rv   rw   rk   rx   r   r   r?  encoderr	   r  r  final_layer_normeos_token_idr   _use_flash_attention_2r   s      r)   rw   CLIPTextTransformer.__init__`  ss    &&	,V4"6* "Y<Q<Q R #// '-&A&AEX&X#r+   r   r   rr   r   rF  r    c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eUR	                  5       nUR                  SUS   5      nU R                  XS9n[        XgR                  UR                  S9nUb&  U R                  (       d  [        X'R                  5      nU R                  UUUUUS9n	U	R                  n
U R                  U
5      n
U R                  S:X  ae  U
[         R"                  " U
R$                  S   U
R                  S9UR'                  [         R(                  U
R                  S9R+                  SS	94   nOU
[         R"                  " U
R$                  S   U
R                  S9UR'                  [         R(                  U
R                  S9U R                  :H  R)                  5       R+                  SS	94   n[-        U
UU	R.                  U	R0                  S
9$ )NzYou have to specify input_idsr5   )r   rr   r"   )r   r   r   r   rF  r4   r   )r   r#   r   rC   pooler_outputrD   rE   )rk   r   rF  r   r   r   r   r   r   r#   rX  r   rU  rC   rV  rW  r&   r'   r   r   r   argmaxr   rD   rE   )ra   r   r   rr   r   rF  input_shaperD   r   encoder_outputsrC   pooled_outputs               r)   r   CLIPTextTransformer.forwardn  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 <==nn&NN2{27	)W !A,,]5I5I!

 %d.I.I7H[H[\N+/<<')"7/!5 ,8 ,
 ,== 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */')77&11	
 	
r+   )rX  rk   r   rU  rW  rV  NNNNN)rG   rH   rI   rJ   r   rw   r   r   r   r&   r   r   r   r   rN   r   r   s   @r)   rS  rS  _  s    Y~ Y  -115/3,0/3F
ELL)F
 !.F
 u||,	F

 $D>F
 'tnF
 
$F
  F
r+   rS  zI
    The text model from CLIP without any head or projection on top.
    )custom_introc                     ^  \ rS rSr\rSS/rS\4U 4S jjrS\R                  4S jr
S r\\     SS	\\R                      S
\\R                      S\\R                      S\\   S\\   S\4S jj5       5       rSrU =r$ )CLIPTextModeli  r   r
  rk   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rv   rw   rS  
text_model	post_initr   s     r)   rw   CLIPTextModel.__init__  s&     -f5r+   r    c                 B    U R                   R                  R                  $ r   rg  r   r   rf   s    r)   get_input_embeddings"CLIPTextModel.get_input_embeddings      ))999r+   c                 8    XR                   R                  l        g r   rk  ra   r   s     r)   set_input_embeddings"CLIPTextModel.set_input_embeddings      5:""2r+   r   r   rr   r   rF  c                 (    U R                  UUUUUS9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, CLIPTextModel

>>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```r   r   rr   r   rF  rg  )ra   r   r   rr   r   rF  s         r)   r   CLIPTextModel.forward  s)    4 )%/!5  
 	
r+   rv  rb  )rG   rH   rI   rJ   r   r9  _no_split_modulesrw   r	   Modulerl  rq  r   r   r   r&   r   r   r   r   rN   r   r   s   @r)   re  re    s     "L-/AB~ :bii :;  -115/3,0/3
ELL)
 !.
 u||,	

 $D>
 'tn
 
$
  
r+   re  c                      ^  \ rS rSrS\4U 4S jjr\\    SS\\	R                     S\\   S\\   S\\   S\4
S	 jj5       5       rS
rU =r$ )CLIPVisionTransformeri  rk   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r  )rv   rw   rk   rx   ri   r   r	   r  r  pre_layrnormr?  rU  post_layernormr   s      r)   rw   CLIPVisionTransformer.__init__  sd    &&	.v6LL8M8MN"6* ll9:O:OPr+   r   r   rF  r   r    c                 ~   Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  XS9nU R                  U5      nU R                  UUUS9nUR                  nUS S 2SS S 24   nU R                  U5      n[        UUUR                  UR                  S9$ )Nz You have to specify pixel_values)r   )r   r   rF  r   r[  )rk   r   rF  r   r   r}  rU  rC   r~  r   rD   rE   )	ra   r   r   rF  r   rD   r_  rC   r`  s	            r)   r   CLIPVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 ?@@h))-8+/<<'/!5 ,8 ,
 ,==)!Q'2++M:)/')77&11	
 	
r+   )rk   r   rU  r~  r}  NNNF)rG   rH   rI   rJ   r   rw   r   r   r   r&   rL   r   r   r   rN   r   r   s   @r)   r{  r{    s    Q/ Q  59,0/338!
u001!
 $D>!
 'tn	!

 #+4.!
 
$!
  !
r+   r{  zK
    The vision model from CLIP without any head or projection on top.
    c                      ^  \ rS rSr\rSrS/rS\4U 4S jjrS\	R                  4S jr\\    SS\\R                      S\\   S	\\   S
\S\4
S jj5       5       rSrU =r$ )CLIPVisionModeli#  r   r
  rk   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rv   rw   r{  vision_modelrh  r   s     r)   rw   CLIPVisionModel.__init__-  s'     1&9r+   r    c                 B    U R                   R                  R                  $ r   r  r   r   rf   s    r)   rl  $CLIPVisionModel.get_input_embeddings3        ++;;;r+   r   rF  r   c                 &    U R                  UUUUS9$ )ag  
Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, CLIPVisionModel

>>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r   r   rF  r   r  )ra   r   r   rF  r   s        r)   r   CLIPVisionModel.forward6  s(    <   %/!5%=	 ! 
 	
r+   r  r  )rG   rH   rI   rJ   r   r9  main_input_namerx  rw   r	   ry  rl  r   r   r   r&   rL   r   r   r   rN   r   r   s   @r)   r  r  #  s     $L$O+,/ <bii <  59,0/3).!
u001!
 $D>!
 'tn	!

 #'!
 
$!
  !
r+   r  c                   "  ^  \ rS rSr\r/ SQrS\4U 4S jjr\     SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\   S
\
R                  4S jj5       r\    SS\	\
R                     S\	\   S	\	\   S\S
\
R                  4
S jj5       r\\        SS\	\
R"                     S\	\
R                     S\	\
R                     S\	\
R"                     S\	\   S\	\   S	\	\   S\S
\4S jj5       5       rSrU =r$ )r'  i\  )r   r
  ri   rk   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        UR                  U l        [        R                  U5      nUR                  U l        [         R                  U5      nUR"                  U l        [$        R&                  " U R                  U R                  SS9U l        [$        R&                  " U R                  U R                  SS9U l        [$        R,                  " [.        R0                  " U R2                  R4                  5      5      U l        U R9                  5         g )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Frq   )rv   rw   r!  text_configr   	TypeErrortyper0  r   projection_dimrx   r)  r+  re  _from_configrg  r  r  r	   r   r*  r(  r|   r&   r2   rk   logit_scale_init_valuelogit_scalerh  )ra   rk   r  r0  rg  r  r   s         r)   rw   CLIPModel.__init__a  s~    &,,n==++,-Q0 
 &..0@AA--./q2 
 ((,,$33)55 - 9 9"//<
$//&33MB(55!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r+   r   r   rr   r   rF  r    c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUUUS9nUR                  nU R                  U5      nU$ )aB  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`CLIPTextModel`].

Examples:

```python
>>> from transformers import AutoTokenizer, CLIPModel

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```ru  )rk   r   rF  rg  r\  r(  )	ra   r   r   rr   r   rF  text_outputsr`  text_featuress	            r)   get_text_featuresCLIPModel.get_text_features  s    4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 48??)%/!5 4C 4
 %22,,];r+   r   r   c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUUS9nUR                  nU R                  U5      nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`CLIPVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, CLIPModel

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```r  )rk   r   rF  r  r\  r*  )ra   r   r   rF  r   vision_outputsr`  image_featuress           r)   get_image_featuresCLIPModel.get_image_features  s    > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 6:5F5F%/!5%=	 6G 6
 '44//>r+   return_lossc	           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUUS9n	U R	                  UUUUUS9n
U	R
                  nU R                  U5      nU
R
                  nU R                  U5      nU[        U5      -  nU[        U5      -  n[        R                  " XR                  5       R                  UR                  5      5      nXR                  R                  5       R                  UR                  5      -  nUR                  5       nSnU(       a  [!        U5      n[#        UUUUUU
U	S9$ )aM  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, CLIPModel

>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```Nr  ru  )rV   rW   rX   rR   rB   rY   rZ   )rk   r   rF  r  rg  r\  r*  r(  r>   r&   r   r.   r   r#   r  expr1   rT   )ra   r   r   r   rr   r  r   rF  r   r  r  rB   rR   rX   rW   rV   s                   r)   r   CLIPModel.forward  sq   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 6:5F5F%/!5%=	 6G 6
 48??)%/!5 4C 4
 &33--l;"00**;7 $&6|&DD!$4[$AA  ,,{NN4D4G4GHZHZ4[\),<,<,@,@,B,E,EkFXFX,YY*,,._-D-+#%* .
 	
r+   )r  r  r)  rg  r(  r+  r  r*  rb  r  )NNNNNNNF)rG   rH   rI   rJ   r   r9  rx  rw   r   r   r&   r   r   rL   r  r  r   r   rT   r   rN   r   r   s   @r)   r'  r'  \  s   LZ!z !F  -115/3,0/3)ELL)) !.) u||,	)
 $D>) 'tn) 
		) )V  59,0/3).-u001- $D>- 'tn	-
 #'- 
		- -^  15481537&*,0/3).U
E,,-U
 u001U
 !.	U

 u//0U
 d^U
 $D>U
 'tnU
 #'U
 
U
  U
r+   r'  c                     ^  \ rS rSr\rSS/rS\4U 4S jjrS\R                  4S jr
S r\\     SS	\\R                      S
\\R                      S\\R                      S\\   S\\   S\4S jj5       5       rSrU =r$ )r-  i:  r   r
  rk   c                    > [         TU ]  U5        [        R                  U5      nUR                  U l        [
        R                  " UR                  UR                  SS9U l	        U R                  5         g NFr  )rv   rw   re  r  rg  r	   r   rx   r  r(  rh  )ra   rk   rg  r   s      r)   rw   $CLIPTextModelWithProjection.__init__@  s[     "//7
$//!yy););V=R=RY^_ 	r+   r    c                 B    U R                   R                  R                  $ r   rk  rf   s    r)   rl  0CLIPTextModelWithProjection.get_input_embeddingsK  rn  r+   c                 8    XR                   R                  l        g r   rk  rp  s     r)   rq  0CLIPTextModelWithProjection.set_input_embeddingsN  rs  r+   r   r   rr   r   rF  c                     U R                  UUUUUS9nUR                  nU R                  U5      n[        UUR                  UR
                  UR                  S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

>>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> text_embeds = outputs.text_embeds
```ru  )rR   rC   rD   rE   )rg  r\  r(  rP   rC   rD   rE   )	ra   r   r   rr   r   rF  r  r`  rR   s	            r)   r   #CLIPTextModelWithProjection.forwardQ  so    2 48??)%/!5 4C 4
 %22**=9"#*<<&44#..	
 	
r+   )rg  r(  rb  )rG   rH   rI   rJ   r   r9  rx  rw   r	   ry  rl  rq  r   r   r   r&   r   r   rP   r   rN   r   r   s   @r)   r-  r-  :  s    !L-/AB	~ 	:bii :;  -115/3,0/3&
ELL)&
 !.&
 u||,	&

 $D>&
 'tn&
 
&
  &
r+   r-  c                      ^  \ rS rSr\rSrS\4U 4S jjrS\R                  4S jr
\\    SS\\R                     S\\   S\\   S	\S\4
S
 jj5       5       rSrU =r$ )r,  i|  r   rk   c                    > [         TU ]  U5        [        R                  U5      nUR                  U l        [
        R                  " UR                  UR                  SS9U l	        U R                  5         g r  )rv   rw   r  r  r  r	   r   rx   r  r*  rh  ra   rk   r  r   s      r)   rw   &CLIPVisionModelWithProjection.__init__  s\     &33F;(55!#6+=+=v?T?T[`!a 	r+   r    c                 B    U R                   R                  R                  $ r   r  rf   s    r)   rl  2CLIPVisionModelWithProjection.get_input_embeddings  r  r+   r   rF  r   c                     U R                  UUUUS9nUR                  nU R                  U5      n[        UUR                  UR
                  UR                  S9$ )a;  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, CLIPVisionModelWithProjection

>>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> image_embeds = outputs.image_embeds
```r  )rB   rC   rD   rE   )r  r\  r*  r@   rC   rD   rE   )ra   r   r   rF  r   r  r`  rB   s           r)   r   %CLIPVisionModelWithProjection.forward  sn    : 6:5F5F%/!5%=	 6G 6
 '44--m<$%,>>(66%00	
 	
r+   )r  r*  r  )rG   rH   rI   rJ   r   r9  r  rw   r	   ry  rl  r   r   r   r&   rL   r   r@   r   rN   r   r   s   @r)   r,  r,  |  s    #L$O	/ 	<bii <  59,0/3).)
u001)
 $D>)
 'tn	)

 #')
 
)
  )
r+   r,  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                      ^  \ rS rSrSrS\SS4U 4S jjr\\    SS\	\
R                     S\	\
R                     S\	\   S	\	\   S\4
S
 jj5       5       rSrU =r$ )r.  i  r   rk   r    Nc                   > [         TU ]  U5        UR                  U l        [        R	                  UR
                  5      nUR                  U l        UR                  S:  a5  [        R                  " UR
                  R                  UR                  5      O[        R                  " 5       U l        U R                  5         g )Nr   )rv   rw   
num_labelsr  r  r0  r  r	   r   rx   Identityr/  rh  r  s      r)   rw   #CLIPForImageClassification.__init__  s      ++&33F4H4HI(55 OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r+   labelsr   rF  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUS9nUR                  n[
        R                  " USS2SS2SS24   SS9nU R                  U5      nSnUGb  UR                  UR                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l
        OoU R                  S:  aN  UR                  [
        R                  :X  d  UR                  [
        R                  :X  a  SU R                   l
        OSU R                   l
        U R                   R                  S:X  aI  [        5       n	U R                  S:X  a&  U	" UR!                  5       UR!                  5       5      nOU	" Xr5      nOU R                   R                  S:X  a=  [#        5       n	U	" UR%                  SU R                  5      UR%                  S5      5      nO,U R                   R                  S:X  a  ['        5       n	U	" Xr5      n[)        UUUR*                  UR,                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r   rF  r   r   
regressionsingle_label_classificationmulti_label_classificationr5   )rV   r   rD   rE   )rk   r   rF  r  rC   r&   r  r/  r   r#   problem_typer  r   longr   r   squeezer   r   r
   r   rD   rE   )
ra   r   r  r   rF  r  sequence_outputr   rV   loss_fcts
             r)   r   "CLIPForImageClassification.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 /3.?.?/!5 /@ /
 "33  **_QAX%>AF1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./$!//))	
 	
r+   )r/  r  r  rQ  )rG   rH   rI   rJ   r  r   rw   r   r   r   r&   r   r   r   r   rN   r   r   s   @r)   r.  r.    s     %Oz d   04)-,0/3=
u||,=
 &=
 $D>	=

 'tn=
 
=
  =
r+   r.  )r'  r  re  r-  r  r,  r.  )r   T)CrK   dataclassesr   typingr   r   r   r   r   r&   r	   torch.nnr
   r   r   activationsr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   modeling_utilsr   r   utilsr   r   r   r   r   configuration_clipr   r   r   
get_loggerrG   r   r   r*   r1   r>   r@   rP   rT   ry  ri   r   floatr   r   r   r   r
  r  r?  rS  re  r{  r  r'  r-  r,  r.  __all__rF   r+   r)   <module>r     s    ! 8 8   A A ! d b b F V V L L 
		H	%
`U\\ `ell `-%,, -5<< -U\\ ell  ?K ? ?8 ?+ ? ?8 !
 !
 !
HP299 Pf% %^ "%II%<<% 
% <<	%
 U\\*% % % %0N)BII N)bbii /ryy /d =%/ =% =%@]
")) ]
@W
")) W
t 
1
' 1

1
h.
BII .
b 
1
) 1

1
h Z
# Z
 Z
z >
"5 >
 >
B =
$7 =
 =
@ Q
!4 Q
Q
hr+   