
"""PyTorch OWLv2 model."""

from dataclasses import dataclass
from functools import lru_cache
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, is_vision_available, logging, torch_int
from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig


if is_vision_available():
    from ...image_transforms import center_to_corners_format


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def owlv2_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
class Owlv2Output(ModelOutput):
    """
Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`Owlv2VisionModel`].
    text_model_output (Tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


def _upcast(t: Tensor) -> Tensor:
    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
    if t.is_floating_point():
        return t if t.dtype in (torch.float32, torch.float64) else t.float()
    else:
        return t if t.dtype in (torch.int32, torch.int64) else t.int()


def box_area(boxes: Tensor) -> Tensor:
    """
Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

Args:
    boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
        Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
        < x2` and `0 <= y1 < y2`.

Returns:
    `torch.FloatTensor`: a tensor containing the area for each box.
    """
    boxes = _upcast(boxes)
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def box_iou(boxes1, boxes2):
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]

    union = area1[:, None] + area2 - inter

    iou = inter / union
    return iou, union


def generalized_box_iou(boxes1, boxes2):
    """
Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

Returns:
    `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    """
    # degenerate boxes give inf / nan results, so do an early check
    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
    iou, union = box_iou(boxes1, boxes2)

    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])

    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
    area = width_height[:, :, 0] * width_height[:, :, 1]

    return iou - (area - union) / area


@dataclass
class Owlv2ObjectDetectionOutput(ModelOutput):
    """
Output type of [`Owlv2ForObjectDetection`].

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
        Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
        The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the
        total number of patches is (image_size / patch_size)**2.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
        embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (Tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    loss_dict: Optional[Dict] = None
    logits: Optional[torch.FloatTensor] = None
    objectness_logits: Optional[torch.FloatTensor] = None
    pred_boxes: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    class_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@dataclass
class Owlv2ImageGuidedObjectDetectionOutput(ModelOutput):
    """
Output type of [`Owlv2ForObjectDetection.image_guided_detection`].

Args:
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual target image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual query image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim)`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (Tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    """

    logits: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    query_image_embeds: Optional[torch.FloatTensor] = None
    target_pred_boxes: Optional[torch.FloatTensor] = None
    query_pred_boxes: Optional[torch.FloatTensor] = None
    class_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Owlv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.patch_size = config.patch_size
        self.config = config
        self.embed_dim = config.hidden_size
        self.class_embedding = nn.Parameter(torch.randn(config.hidden_size))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )

        self.num_patches = (config.image_size // config.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [batch_size, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings


class Owlv2TextEmbeddings(nn.Module):
    def __init__(self, config: Owlv2TextConfig):
        super().__init__()
        self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


class Owlv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
            )

        # apply the causal attention mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # reshape twice so that attn_weights keeps its gradient and can be returned
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # For int8 compatibility, sometimes the `attn_probs` are in `fp32`
        attn_probs = attn_probs.to(value_states.dtype)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped


class Owlv2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Owlv2EncoderLayer(nn.Module):
    def __init__(self, config: Owlv2Config):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = Owlv2Attention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Owlv2MLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        `(config.encoder_attention_heads,)`.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class Owlv2PreTrainedModel(PreTrainedModel):
    config_class = Owlv2Config
    base_model_prefix = "owlv2"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Owlv2EncoderLayer"]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, Owlv2TextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, Owlv2VisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, Owlv2Attention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, Owlv2MLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, Owlv2Model):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class Owlv2Encoder(nn.Module):
    """
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
[`Owlv2EncoderLayer`].

Args:
        config: Owlv2Config
    """

    def __init__(self, config: Owlv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Owlv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrN   )r  r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frX   rN   )rA   vs     r'   rD   'Owlv2Encoder.forward.<locals>.<genexpr>  s     e$Sq$S   	)last_hidden_stater  
attentions)r   r  r^  use_return_dictr[  r\  r	  _gradient_checkpointing_func__call__rI   r   )rC   r   r  r  r  r^  r_  encoder_statesall_attentionsr  encoder_layerlayer_outputss               r'   r   Owlv2Encoder.forwardf  s/   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[M#!/2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M  !/3C2E!E- )0  +.>>Ne]N$Seee+Vd
 	
r)   )r\  r[  NNNNN)rO   rP   rQ   rR   rS   r   r   r   r$   r
   r   r	   r   r   r   rV   r   r   s   @r'   rW  rW  X  s    ,{ , 268<,0/3&*H
 !.H
  (5	H

 $D>H
 'tnH
 d^H
 
uo%	&H
 H
r)   rW  c                      ^  \ rS rSrS\4U 4S jjr\     SS\R                  S\	\R                     S\	\R                     S\	\
   S\	\
   S	\	\
   S
\\\4   4S jj5       rSrU =r$ )Owlv2TextTransformeri  r   c                    > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        g r'  )r   r   r   r   r   r   rW  encoderr   r+  r,  final_layer_norm)rC   r   r   r   s      r'   r   Owlv2TextTransformer.__init__  sM    &&	-f5#F+ "Y<Q<Q Rr)   r   r  r   r  r^  r_  r   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR	                  5       nUR                  SUS   5      nU R                  XS9n[        XxR                  UR                  S9n	Ub  [        X(R                  5      nU R                  UUU	UUUS9n
U
S   nU R                  U5      nU[        R                  " UR                  S   UR                  S9UR!                  [        R"                  5      R%                  SS9R!                  UR                  5      4   nU(       d	  X4U
SS -   $ ['        UUU
R(                  U
R*                  S	9$ )
aT  
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # OWLv2's text model uses a causal mask, prepare it here.
        causal_attention_mask = _create_4d_causal_attention_mask(
            input_shape, hidden_states.dtype, device=hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # take features from the end-of-token embedding (the EOS token is the highest id in each sequence)
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device),
        ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Owlv2TextModel(Owlv2PreTrainedModel):
    config_class = Owlv2TextConfig

    def __init__(self, config: Owlv2TextConfig):
        super().__init__(config)
        self.text_model = Owlv2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)

Examples:
```python
>>> from transformers import AutoProcessor, Owlv2TextModel

>>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
>>> inputs = processor(
...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
... )
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
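>>> # Editor's illustrative addition (assumed shapes, not part of the original example): the
>>> # pooler output holds one EOS-token embedding per text query across all prompts.
>>> pooled_output.shape  # doctest: +SKIP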
        ```"""
        # Get embeddings for all text queries in all batch samples
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class Owlv2VisionTransformer(nn.Module):
    def __init__(self, config: Owlv2VisionConfig):
        super().__init__()
        self.config = config

        self.embeddings = Owlv2VisionEmbeddings(config)
        self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.encoder = Owlv2Encoder(config)
        self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Cast the input to the expected `dtype`
        expected_input_dtype = self.embeddings.patch_embedding.weight.dtype
        pixel_values = pixel_values.to(expected_input_dtype)

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layernorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]

        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Owlv2VisionModel(Owlv2PreTrainedModel):
    config_class = Owlv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Owlv2VisionConfig):
        super().__init__(config)
        self.vision_model = Owlv2VisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2VisionModel

>>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
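>>> # Editor's illustrative addition (assumed shapes): the hidden states cover the CLS token
>>> # plus one token per image patch, i.e. (batch_size, num_patches + 1, hidden_size).
>>> last_hidden_state.shape  # doctest: +SKIP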
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring
class Owlv2Model(Owlv2PreTrainedModel):
    config_class = Owlv2Config

    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        if not isinstance(config.text_config, Owlv2TextConfig):
            raise TypeError(
                "config.text_config is expected to be of type Owlv2TextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, Owlv2VisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type Owlv2VisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = Owlv2TextTransformer(text_config)
        self.vision_model = Owlv2VisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids)

Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`Owlv2TextModel`].

Examples:
```python
>>> from transformers import AutoProcessor, Owlv2Model

>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> inputs = processor(
...     text=[["a photo of a cat", "a photo of a dog"], ["photo of an astronaut"]], return_tensors="pt"
... )
>>> text_features = model.get_text_features(**inputs)
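>>> # Editor's illustrative addition (assumption): one projected embedding per text query,
>>> # i.e. shape (total_num_queries, projection_dim).
>>> text_features.shape  # doctest: +SKIP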
        ```"""
        # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict)
        pooled_output = text_output[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`Owlv2VisionModel`].

Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2Model

>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
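>>> # Editor's illustrative addition (assumption): a single pooled, projected embedding per image,
>>> # i.e. shape (batch_size, projection_dim).
>>> image_features.shape  # doctest: +SKIP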
        ```"""
        # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_base_image_embeds: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Owlv2Output]:
        r"""
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.
return_base_image_embeds (`bool`, *optional*):
    Whether or not to return the base image embeddings.

Examples:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Owlv2Model

>>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
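>>> # Editor's illustrative addition: the text-to-image scores are the transpose of the
>>> # image-to-text scores returned above.
>>> logits_per_text = outputs.logits_per_text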
```Nr  r  r   rd   r   T)ordr   keepdim)r3   r4   r5   r6   r7   r8   r9   )r   r  r^  rg  r  r  rE  rG  r$   linalgnormr  expr  r!   matmulr,   r/   r1   )rC   r   r   r  r  r  r^  r   r  r_  r  text_outputsr6   r7   text_embeds_normr  r5   r4   r3   outputs                       r'   r   Owlv2Model.forward  s   F 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )/!5# ' 
 #1o**;7%a(--l; $ell&7&7!QS]a&7&bb&):):;ASU_c):)dd &&**,//0C0CD,,'79IJ[X*,,.o.D&&T`qF)-)9TGf$EvE-+#%* .
 	
r)   )r  r  rF  r  rE  rH  r  rG  ro  r  )	NNNNNNFNN)rO   rP   rQ   rR   r   rR  r   r   r   r$   r
   r   rT   r  r  r   r	   r   r1   r   rV   r   r   s   @r'   rD  rD    s    L{ @  -115,0/3&*%ELL)% !.% $D>	%
 'tn% d^% 
		% %N  59,0/3).&*,u001, $D>, 'tn	,
 #', d^, 
		, ,\  154815&*,0/3).37&*Z
E,,-Z
 u001Z
 !.	Z

 d^Z
 $D>Z
 'tnZ
 #'Z
 #+4.Z
 d^Z
 
uk!	"Z
 Z
r)   rD  c                   r   ^  \ rS rSrS	S\S\4U 4S jjjrS\R                  S\R                  4S jr
SrU =r$ )
Owlv2BoxPredictionHeadiz  r   out_dimc                 $  > [         TU ]  5         UR                  R                  n[        R
                  " X35      U l        [        R
                  " X35      U l        [        R                  " 5       U l	        [        R
                  " X25      U l
        g rX   )r   r   r  r   r   r   dense0dense1GELUgeludense2)rC   r   r  r   r   s       r'   r   Owlv2BoxPredictionHead.__init__{  s\    $$00ii-ii-GGI	ii/r)   r  r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ rX   )r  r  r  r  )rC   r  r  s      r'   r   Owlv2BoxPredictionHead.forward  sM    ^,6"V$6"V$r)   )r  r  r  r  )   )rO   rP   rQ   rR   r   r`   r   r$   r
   rT   r   rV   r   r   s   @r'   r  r  z  s=    0{ 0S 0 0ell u7H7H  r)   r  c            	          ^  \ rS rSrS\4U 4S jjrS\R                  S\\R                     S\\R                     S\

class Owlv2ClassPredictionHead(nn.Module):
    def __init__(self, config: Owlv2Config):
        super().__init__()

        out_dim = config.text_config.hidden_size
        self.query_dim = config.vision_config.hidden_size

        self.dense0 = nn.Linear(self.query_dim, out_dim)
        self.logit_shift = nn.Linear(self.query_dim, 1)
        self.logit_scale = nn.Linear(self.query_dim, 1)
        self.elu = nn.ELU()

    def forward(
        self,
        image_embeds: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor],
        query_mask: Optional[torch.Tensor],
    ) -> Tuple[torch.FloatTensor]:
        image_class_embeds = self.dense0(image_embeds)
        if query_embeds is None:
            device = image_class_embeds.device
            batch_size, num_patches = image_class_embeds.shape[:2]
            pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device)
            return (pred_logits, image_class_embeds)

        # Normalize image and text features
        image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6)
        query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6)

        # Get class predictions: one similarity score per (patch, query) pair
        pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds)

        # Apply a learnable shift and scale to the logits, predicted per patch
        logit_shift = self.logit_shift(image_embeds)
        logit_scale = self.logit_scale(image_embeds)
        logit_scale = self.elu(logit_scale) + 1
        pred_logits = (pred_logits + logit_shift) * logit_scale

        if query_mask is not None:
            if query_mask.ndim > 1:
                query_mask = torch.unsqueeze(query_mask, dim=-2)

            # Padded queries are masked out with the smallest representable value
            pred_logits = torch.where(query_mask == 0, torch.finfo(pred_logits.dtype).min, pred_logits)
            pred_logits = pred_logits.to(torch.float32)

        return (pred_logits, image_class_embeds)

class Owlv2ForObjectDetection(Owlv2PreTrainedModel):
    config_class = Owlv2Config

    def __init__(self, config: Owlv2Config):
        super().__init__(config)

        self.owlv2 = Owlv2Model(config)
        self.class_head = Owlv2ClassPredictionHead(config)
        self.box_head = Owlv2BoxPredictionHead(config)
        self.objectness_head = Owlv2BoxPredictionHead(config, out_dim=1)

        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps)
        self.sigmoid = nn.Sigmoid()

        self.config = config
        self.num_patches_height = self.config.vision_config.image_size // self.config.vision_config.patch_size
        self.num_patches_width = self.config.vision_config.image_size // self.config.vision_config.patch_size
        self.box_bias = self.compute_box_bias(self.num_patches_height, self.num_patches_width)

    @staticmethod
    def normalize_grid_corner_coordinates(num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        # Create a grid of (x, y) patch coordinates
        x_coordinates = torch.arange(1, num_patches_width + 1, dtype=torch.float32)
        y_coordinates = torch.arange(1, num_patches_height + 1, dtype=torch.float32)
        xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy")

        # Stack the coordinates and normalize them to [0, 1] by the grid size
        box_coordinates = torch.stack((xx, yy), dim=-1)
        box_coordinates[..., 0] /= num_patches_width
        box_coordinates[..., 1] /= num_patches_height

        # Flatten (num_patches_height, num_patches_width, 2) -> (num_patches, 2)
        box_coordinates = box_coordinates.view(-1, 2)

        return box_coordinates

    def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.FloatTensor:
        """Predicts the probability that each image feature token is an object.

        Args:
            image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`):
                Features extracted from the image.
        Returns:
            Objectness scores.
        """
        image_features = image_features.detach()
        objectness_logits = self.objectness_head(image_features)
        objectness_logits = objectness_logits[..., 0]
        return objectness_logits

    @lru_cache(maxsize=2)
    def compute_box_bias(
        self, num_patches_height: int, num_patches_width: int, feature_map: Optional[torch.FloatTensor] = None
    ) -> torch.Tensor:
        if feature_map is not None:
            raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead")
        # The box center is biased to its position on the feature grid
        box_coordinates = self.normalize_grid_corner_coordinates(num_patches_height, num_patches_width)
        box_coordinates = torch.clip(box_coordinates, 0.0, 1.0)

        # Unnormalize xy (inverse sigmoid in logit space)
        box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4)

        # The box size is biased to the patch size
        box_size = torch.full_like(box_coord_bias, 1.0)
        box_size[..., 0] /= num_patches_width
        box_size[..., 1] /= num_patches_height
        box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4)

        # Compute box bias
        box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1)
        return box_bias

    def box_predictor(
        self,
        image_feats: torch.FloatTensor,
        feature_map: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        """
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        """
        # Bounding box detection head [batch_size, num_boxes, 4].
        pred_boxes = self.box_head(image_feats)

        # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction
        if interpolate_pos_encoding:
            _, num_patches_height, num_patches_width, _ = feature_map.shape
            box_bias = self.compute_box_bias(num_patches_height, num_patches_width)
        else:
            box_bias = self.box_bias
        box_bias = box_bias.to(feature_map.device)
        pred_boxes += box_bias
        pred_boxes = self.sigmoid(pred_boxes)
        return pred_boxes

    def class_predictor(
        self,
        image_feats: torch.FloatTensor,
        query_embeds: Optional[torch.FloatTensor] = None,
        query_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        """
        (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask)

        return (pred_logits, image_class_embeds)

    def image_text_embedder(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Tuple[torch.FloatTensor]:
        # Encode text and image
        outputs = self.owlv2(
            pixel_values=pixel_values,
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if interpolate_pos_encoding:
            _, _, height, width = pixel_values.shape
            num_patches_height = height // self.config.vision_config.patch_size
            num_patches_width = width // self.config.vision_config.patch_size
        else:
            num_patches_height = self.num_patches_height
            num_patches_width = self.num_patches_width

        # Get image embeddings
        last_hidden_state = outputs.vision_model_output[0]
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
        new_size = (
            image_embeds.shape[0],
            num_patches_height,
            num_patches_width,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)
        text_embeds = outputs[-4]

        return (text_embeds, image_embeds, outputs)

    def image_embedder(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Tuple[torch.FloatTensor]:
        # Get Owlv2Model vision embeddings (same as CLIP)
        vision_outputs = self.owlv2.vision_model(
            pixel_values=pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, return_dict=True
        )

        if interpolate_pos_encoding:
            _, _, height, width = pixel_values.shape
            num_patches_height = height // self.config.vision_config.patch_size
            num_patches_width = width // self.config.vision_config.patch_size
        else:
            num_patches_height = self.num_patches_height
            num_patches_width = self.num_patches_width

        # Apply post_layernorm to last_hidden_state, return non-projected output
        last_hidden_state = vision_outputs[0]
        image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state)

        # Resize class token
        class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape)

        # Merge image embedding with class tokens
        image_embeds = image_embeds[:, 1:, :] * class_token_out
        image_embeds = self.layer_norm(image_embeds)

        # Resize to [batch_size, num_patches_height, num_patches_width, hidden_size]
        new_size = (
            image_embeds.shape[0],
            num_patches_height,
            num_patches_width,
            image_embeds.shape[-1],
        )
        image_embeds = image_embeds.reshape(new_size)

        return (image_embeds, vision_outputs)

    def embed_image_query(
        self,
        query_image_features: torch.FloatTensor,
        query_feature_map: torch.FloatTensor,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        _, class_embeds = self.class_predictor(query_image_features)
        pred_boxes = self.box_predictor(query_image_features, query_feature_map, interpolate_pos_encoding)
        pred_boxes_as_corners = center_to_corners_format(pred_boxes)

        # Loop over query images
        best_class_embeds = []
        best_box_indices = []
        pred_boxes_device = pred_boxes_as_corners.device

        for i in range(query_image_features.shape[0]):
            each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device)
            each_query_pred_boxes = pred_boxes_as_corners[i]
            ious, _ = box_iou(each_query_box, each_query_pred_boxes)

            # If there are no overlapping boxes, fall back to generalized IoU
            if torch.all(ious[0] == 0.0):
                ious = generalized_box_iou(each_query_box, each_query_pred_boxes)

            # Use an adaptive threshold to include all boxes within 80% of the best IoU
            iou_threshold = torch.max(ious) * 0.8

            selected_inds = (ious[0] >= iou_threshold).nonzero()
            if selected_inds.numel():
                selected_embeddings = class_embeds[i][selected_inds.squeeze(1)]
                mean_embeds = torch.mean(class_embeds[i], axis=0)
                mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings)
                best_box_ind = selected_inds[torch.argmin(mean_sim)]
                best_class_embeds.append(class_embeds[i][best_box_ind])
                best_box_indices.append(best_box_ind)

        if best_class_embeds:
            query_embeds = torch.stack(best_class_embeds)
            box_indices = torch.stack(best_box_indices)
        else:
            query_embeds, box_indices = None, None

        return query_embeds, box_indices, pred_boxes

    @auto_docstring
    def image_guided_detection(
        self,
        pixel_values: torch.FloatTensor,
        query_pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Owlv2ImageGuidedObjectDetectionOutput:
        r"""
query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
    Pixel values of query image(s) to be detected. Pass in one query image per target image.

Examples:
```python
>>> import requests
>>> from PIL import Image
>>> import torch
>>> from transformers import AutoProcessor, Owlv2ForObjectDetection

>>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
>>> query_image = Image.open(requests.get(query_url, stream=True).raw)
>>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")

>>> # forward pass
>>> with torch.no_grad():
...     outputs = model.image_guided_detection(**inputs)

>>> target_sizes = torch.Tensor([image.size[::-1]])

>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_image_guided_detection(
...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
... )
>>> i = 0  # Retrieve predictions for the first image
>>> boxes, scores = results[i]["boxes"], results[i]["scores"]
>>> for box, score in zip(boxes, scores):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Compute feature maps for the input and query images
        query_feature_map = self.image_embedder(
            pixel_values=query_pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )[0]
        feature_map, vision_outputs = self.image_embedder(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))

        batch_size, num_patches_height, num_patches_width, hidden_dim = query_feature_map.shape
        query_image_feats = torch.reshape(
            query_feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim)
        )

        # Get top class embedding and best box index for each query image in batch
        query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(
            query_image_feats, query_feature_map, interpolate_pos_encoding
        )

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds)

        # Predict object boxes
        target_pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)

        if not return_dict:
            output = (
                feature_map,
                query_feature_map,
                target_pred_boxes,
                query_pred_boxes,
                pred_logits,
                class_embeds,
                vision_outputs.to_tuple(),
            )
            output = tuple(x for x in output if x is not None)
            return output

        return Owlv2ImageGuidedObjectDetectionOutput(
            image_embeds=feature_map,
            query_image_embeds=query_feature_map,
            target_pred_boxes=target_pred_boxes,
            query_pred_boxes=query_pred_boxes,
            logits=pred_logits,
            class_embeds=class_embeds,
            text_model_output=None,
            vision_model_output=vision_outputs,
        )

    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        pixel_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Owlv2ObjectDetectionOutput:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
    [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
    IDs?](../glossary#input-ids).
output_hidden_states (`bool`, *optional*):
    Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
    `vision_model_last_hidden_state` under returned tensors for more detail.

Examples:
```python
>>> import requests
>>> from PIL import Image
>>> import torch

>>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

>>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
>>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text_labels = [["a photo of a cat", "a photo of a dog"]]
>>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
>>> outputs = model(**inputs)

>>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
>>> target_sizes = torch.tensor([(image.height, image.width)])
>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
>>> results = processor.post_process_grounded_object_detection(
...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
... )
>>> # Retrieve predictions for the first image for the corresponding text queries
>>> result = results[0]
>>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
>>> for box, score, text_label in zip(boxes, scores, text_labels):
...     box = [round(i, 2) for i in box.tolist()]
...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Embed images and text queries
        query_embeds, feature_map, outputs = self.image_text_embedder(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        # Text and vision model outputs
        text_outputs = outputs.text_model_output
        vision_outputs = outputs.vision_model_output

        batch_size, num_patches_height, num_patches_width, hidden_dim = feature_map.shape
        image_feats = torch.reshape(feature_map, (batch_size, num_patches_height * num_patches_width, hidden_dim))

        # Reshape from [batch_size * max_text_queries, query_dim] -> [batch_size, max_text_queries, query_dim]
        max_text_queries = input_ids.shape[0] // batch_size
        query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1])

        # If first token is 0, then this is a padded query [batch_size, num_queries].
        input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1])
        query_mask = input_ids[..., 0] > 0

        # Predict object classes [batch_size, num_patches, num_queries+1]
        (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask)

        # Predict objectness
        objectness_logits = self.objectness_predictor(image_feats)

        # Predict object boxes
        pred_boxes = self.box_predictor(image_feats, feature_map, interpolate_pos_encoding)

        if not return_dict:
            output = (
                pred_logits,
                objectness_logits,
                pred_boxes,
                query_embeds,
                feature_map,
                class_embeds,
                text_outputs.to_tuple(),
                vision_outputs.to_tuple(),
            )
            output = tuple(x for x in output if x is not None)
            return output

        return Owlv2ObjectDetectionOutput(
            image_embeds=feature_map,
            text_embeds=query_embeds,
            pred_boxes=pred_boxes,
            logits=pred_logits,
            objectness_logits=objectness_logits,
            class_embeds=class_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )

__all__ = ["Owlv2Model", "Owlv2PreTrainedModel", "Owlv2TextModel", "Owlv2VisionModel", "Owlv2ForObjectDetection"]