
from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from transformers.models.siglip.modeling_siglip import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SiglipForImageClassification,
    SiglipModel,
    SiglipMultiheadAttentionPoolingHead,
    SiglipOutput,
    SiglipPreTrainedModel,
    SiglipTextModel,
    SiglipTextModelOutput,
    SiglipVisionModel,
    SiglipVisionModelOutput,
    SiglipVisionTransformer,
)

from ...modeling_attn_mask_utils import _prepare_4d_attention_mask


class Siglip2TextConfig(SiglipTextConfig):
    pass


class Siglip2VisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`Siglip2VisionModel`]. It is used to instantiate a
    Siglip2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip2
    [google/siglip2-base-patch16-naflex](https://huggingface.co/google/siglip2-base-patch16-naflex) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        num_patches (`int`, *optional*, defaults to 256):
            The number of patches in the image with the size of (`patch_size`, `patch_size`).
            The image is resized to fill at most this number of patches while preserving the
            aspect ratio. If the resulting number of patches is lower, the image is padded
            along the "patch" dimension.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel

    >>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
    >>> configuration = Siglip2VisionConfig()

    >>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
    >>> model = Siglip2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
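
    >>> # Illustrative only: the NaFlex-style config trades the fixed `image_size` of Siglip for a
    >>> # patch budget, so a variant with a larger budget is just a different `num_patches` value.
    >>> configuration_1024 = Siglip2VisionConfig(num_patches=1024)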
    ```"""

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        num_patches=256,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.num_patches = num_patches
        del self.image_size


class Siglip2Config(SiglipConfig):
    pass


class Siglip2VisionOutput(SiglipVisionModelOutput):
    pass


class Siglip2TextOutput(SiglipTextModelOutput):
    pass


class Siglip2Output(SiglipOutput):
    pass


class Siglip2VisionEmbeddings(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size

        # Patches arrive already flattened: (num_channels * patch_size * patch_size) values per patch.
        self.patch_embedding = nn.Linear(
            in_features=config.num_channels * self.patch_size * self.patch_size,
            out_features=self.embed_dim,
        )

        self.num_patches = config.num_patches
        self.position_embedding_size = int(self.num_patches**0.5)
        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        max_length: int,
    ) -> torch.Tensor:
        """
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU because antialiased interpolation is not supported for half precision there
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, dim, height, width) -> (1, dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast back to the original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`List[Tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        """
        # Apply patch embeddings to already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # Get resized and padded positional embeddings
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
        )

        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds + resized_positional_embeddings
        return embeddings


class Siglip2VisionTransformer(SiglipVisionTransformer):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, spatial_shapes)

        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
        else:
            encoder_attention_mask = attention_mask

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Siglip2PreTrainedModel(SiglipPreTrainedModel):
    pass


class Siglip2TextModel(SiglipTextModel):
    pass


class Siglip2MultiheadAttentionPoolingHead(SiglipMultiheadAttentionPoolingHead):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)
        self.num_heads = config.num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            # Expand the 2D padding mask to 4D, then flatten it per attention head:
            # (batch_size * num_heads, target_len, source_len) as expected by the attention op.
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


class Siglip2VisionModel(SiglipVisionModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
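
        >>> # Illustrative note: the Siglip2 processor returns a padded patch sequence together with
        >>> # `pixel_attention_mask` and `spatial_shapes`, so the hidden states cover (padded) patches
        >>> # rather than a fixed grid.
        >>> num_padded_patches = last_hidden_state.shape[1]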
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class Siglip2Model(SiglipModel):
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2VisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
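
        >>> # Illustrative shape check: one pooled embedding vector per input image.
        >>> print(tuple(image_features.shape))  # doctest: +SKIP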
        ```"""
        # Use Siglip2Model's config for some fields (if specified) instead of those of the vision component.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output

        return pooled_output

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Siglip2Output:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image)  # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
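
        >>> # Sketch, not part of the released example: passing `return_loss=True` additionally
        >>> # computes the pairwise sigmoid loss used for training (see the loss block in `forward`).
        >>> with torch.no_grad():
        ...     outputs_with_loss = model(**inputs, return_loss=True)  # doctest: +SKIP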
        ```"""
        # Use Siglip2Model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # Pairwise sigmoid loss: matching (diagonal) text-image pairs get label +1, all other pairs get -1.
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return Siglip2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class Siglip2ForImageClassification(SiglipForImageClassification):
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
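
        >>> # Illustrative follow-up: class probabilities from the (randomly initialized) head.
        >>> probs = logits.softmax(dim=-1)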
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.vision_model(
            pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens, ignoring padded patches if a mask is provided
        if pixel_attention_mask is not None:
            pool_mask = pixel_attention_mask[..., None].to(sequence_output.device)
            sequence_output = torch.sum(sequence_output * pool_mask, dim=1) / torch.sum(pool_mask, dim=1)
        else:
            sequence_output = torch.mean(sequence_output, dim=1)

        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Siglip2Config",
    "Siglip2TextConfig",
    "Siglip2VisionConfig",
    "Siglip2Model",
    "Siglip2PreTrainedModel",
    "Siglip2TextModel",
    "Siglip2VisionModel",
    "Siglip2ForImageClassification",
]