"""PyTorch Siglip model."""

import math
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig


logger = logging.get_logger(__name__)


def _trunc_normal_(tensor, mean, std, a, b):
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


@dataclass
class SiglipVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class SiglipTextModelOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class SiglipOutput(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`SiglipTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`SiglipVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embedding.weight.shape[0]

        # always interpolate when tracing so the exported graph works for any resolution
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        patch_pos_embed = self.position_embedding.weight.unsqueeze(0)

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


class SiglipTextEmbeddings(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[SiglipVisionConfig, SiglipTextConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Union[SiglipVisionConfig, SiglipTextConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


@auto_docstring
class SiglipPreTrainedModel(PreTrainedModel):
    config_class = SiglipConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "SiglipTextEmbeddings",
        "SiglipEncoderLayer",
        "SiglipVisionEmbeddings",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, SiglipConfig)
                else self.config.hidden_size
            )
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.xavier_uniform_(module.q_proj.weight)
            nn.init.xavier_uniform_(module.k_proj.weight)
            nn.init.xavier_uniform_(module.v_proj.weight)
            nn.init.xavier_uniform_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.xavier_uniform_(module.fc1.weight)
            nn.init.xavier_uniform_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
            nn.init.xavier_uniform_(module.probe.data)
            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
            nn.init.zeros_(module.attention.in_proj_bias.data)
        elif isinstance(module, SiglipModel):
            logit_scale_init = torch.log(torch.tensor(1.0))
            module.logit_scale.data.fill_(logit_scale_init)
            module.logit_bias.data.zero_()
        elif isinstance(module, SiglipForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, config: Union[SiglipVisionConfig, SiglipTextConfig]):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class SiglipTextTransformer(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipTextEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.head = nn.Linear(embed_dim, config.projection_size)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # expand the attention mask to 4D, except for flash attention which consumes the 2D mask directly
        if attention_mask is not None and not self._use_flash_attention_2:
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # the last token is used as the pooled representation
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from SigLIP without any head or projection on top.
    """
)
class SiglipTextModel(SiglipPreTrainedModel):
    config_class = SiglipTextConfig

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.text_model = SiglipTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class SiglipVisionTransformer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

    @can_return_tuple
    def forward(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring(
    custom_intro="""
    The vision model from SigLIP without any head or projection on top.
    """
)
class SiglipVisionModel(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)

        self.vision_model = SiglipVisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )


@auto_docstring
class SiglipModel(SiglipPreTrainedModel):
    config_class = SiglipConfig

    def __init__(self, config: SiglipConfig):
        super().__init__(config)

        if not isinstance(config.text_config, SiglipTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type SiglipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, SiglipVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with the proper attention implementation
        text_model = SiglipTextModel._from_config(text_config)
        vision_model = SiglipVisionModel._from_config(vision_config)

        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        pooled_output = text_outputs.pooler_output

        return pooled_output

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`SiglipVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output

        return pooled_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> SiglipOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image)  # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # pairwise sigmoid loss: +1 labels on the diagonal (matching pairs), -1 everywhere else
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return SiglipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring(
    custom_intro="""
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class SiglipForImageClassification(SiglipPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: SiglipConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision model with the proper attention implementation
        vision_model = SiglipVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output, dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "SiglipModel",
    "SiglipPreTrainedModel",
    "SiglipTextModel",
    "SiglipVisionModel",
    "SiglipForImageClassification",
]