from typing import Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.siglip.configuration_siglip import (
    SiglipConfig,
    SiglipTextConfig,
    SiglipVisionConfig,
)
from transformers.models.siglip.modeling_siglip import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    SiglipForImageClassification,
    SiglipModel,
    SiglipMultiheadAttentionPoolingHead,
    SiglipOutput,
    SiglipPreTrainedModel,
    SiglipTextModel,
    SiglipTextModelOutput,
    SiglipVisionModel,
    SiglipVisionModelOutput,
    SiglipVisionTransformer,
)

from ...modeling_attn_mask_utils import _prepare_4d_attention_mask


class Siglip2TextConfig(SiglipTextConfig):
    pass


class Siglip2VisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`Siglip2VisionModel`]. It is used to instantiate a
    Siglip2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip2
    [google/siglip2-base-patch16-naflex](https://huggingface.co/google/siglip2-base-patch16-naflex) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        num_patches (`int`, *optional*, defaults to 256):
            The number of patches in the image with the size of (`patch_size`, `patch_size`).
            The image is resized to fill at most this number of patches while preserving its
            aspect ratio. If the resulting number of patches is lower, the patch sequence is
            padded along the patch dimension.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import Siglip2VisionConfig, Siglip2VisionModel

    >>> # Initializing a Siglip2VisionConfig with google/siglip2-base-patch16-naflex style configuration
    >>> configuration = Siglip2VisionConfig()

    >>> # Initializing a Siglip2VisionModel (with random weights) from the google/siglip2-base-patch16-naflex style configuration
    >>> model = Siglip2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
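    >>> # Illustrative addition (not part of the original example): the NaFlex vision config
    >>> # is sized by `num_patches` instead of a fixed `image_size`
    >>> configuration.num_patches
    256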
    ```"""

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        num_patches=256,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        # NaFlex: the vision tower is defined by a patch budget rather than a fixed image size
        del self.image_size


class Siglip2Config(SiglipConfig):
    pass


class Siglip2VisionOutput(SiglipVisionModelOutput):
    pass


class Siglip2TextOutput(SiglipTextModelOutput):
    pass


class Siglip2Output(SiglipOutput):
    pass


class Siglip2VisionEmbeddings(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size

        # Patches arrive already flattened, so a linear projection replaces the usual Conv2d patch embedding
        self.patch_embedding = nn.Linear(
            in_features=config.num_channels * self.patch_size * self.patch_size,
            out_features=self.embed_dim,
        )

        self.num_patches = config.num_patches
        self.position_embedding_size = int(self.num_patches**0.5)
        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        max_length: int,
    ) -> torch.Tensor:
        """
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
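
        Example (illustrative; not part of the original docstring, the shapes below are arbitrary):

        ```python
        >>> import torch
        >>> from transformers.models.siglip2.modeling_siglip2 import Siglip2VisionEmbeddings

        >>> # a 16x16 grid of learned position embeddings with hidden size 768
        >>> grid = torch.randn(16, 16, 768)
        >>> # two images: one resized to an 8x16 patch grid, one to a 16x16 patch grid
        >>> spatial_shapes = torch.tensor([[8, 16], [16, 16]])
        >>> resized = Siglip2VisionEmbeddings.resize_positional_embeddings(grid, spatial_shapes, max_length=256)
        >>> tuple(resized.shape)  # positions past height * width are padded for the first image
        (2, 256, 768)
        ```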
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU, where antialiased bilinear interpolation is not implemented for half dtypes
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, embed_dim, height, width) -> (1, embed_dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, embed_dim, target_height, target_width) -> (target_height * target_width, embed_dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast back to the original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`List[Tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        """
        # Apply patch embeddings to the already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # Resize positional embeddings to each image's patch grid and pad them to a fixed length
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
        )

        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds + resized_positional_embeddings
        return embeddings


class Siglip2VisionTransformer(SiglipVisionTransformer):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, spatial_shapes)

        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
        else:
            encoder_attention_mask = attention_mask

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Siglip2PreTrainedModel(SiglipPreTrainedModel):
    pass


class Siglip2TextModel(SiglipTextModel):
    pass


class Siglip2MultiheadAttentionPoolingHead(SiglipMultiheadAttentionPoolingHead):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)
        self.num_heads = config.num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            # Expand the padding mask so the pooling attention ignores padded patches
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


class Siglip2VisionModel(SiglipVisionModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
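        >>> # Illustrative addition (not part of the original example): `pooler_output` is the
        >>> # attention-pooled image embedding of shape (batch_size, hidden_size)
        >>> pooled_output.shape  # doctest: +SKIP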
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class Siglip2Model(SiglipModel):
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2VisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
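        >>> # Illustrative addition (not part of the original example): one pooled feature
        >>> # vector of size `hidden_size` is returned per image
        >>> image_features.shape  # doctest: +SKIP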
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output

        return pooled_output

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Siglip2Output:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
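        >>> # Illustrative addition (not part of the original example): logits_per_image has shape
        >>> # (num_images, num_texts); a sigmoid is applied per pair rather than a softmax over texts
        >>> logits_per_image.shape  # doctest: +SKIP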
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # Pairwise sigmoid loss: +1 targets on the diagonal (matching pairs), -1 everywhere else
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return Siglip2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


class Siglip2ForImageClassification(SiglipForImageClassification):
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
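        >>> # Illustrative addition (not part of the original example): the logits have shape
        >>> # (batch_size, config.num_labels)
        >>> logits.shape  # doctest: +SKIP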
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.vision_model(
            pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens, ignoring padded patches
        if pixel_attention_mask is not None:
            pool_mask = pixel_attention_mask[..., None].to(sequence_output.device)
            sequence_output = torch.sum(sequence_output * pool_mask, dim=1) / torch.sum(pool_mask, dim=1)
        else:
            sequence_output = torch.mean(sequence_output, dim=1)

        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the same device as the logits
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Siglip2Config",
    "Siglip2TextConfig",
    "Siglip2VisionConfig",
    "Siglip2Model",
    "Siglip2PreTrainedModel",
    "Siglip2TextModel",
    "Siglip2VisionModel",
    "Siglip2ForImageClassification",
]