from typing import Callable, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from .configuration_mlcd import MLCDVisionConfig


logger = logging.get_logger(__name__)


class MLCDMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class MLCDRotaryEmbedding(nn.Module):
    def __init__(self, dim: int, theta: float = 10000.0) -> None:
        super().__init__()
        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Per-patch (row, column) indices over the grid
        hpos_ids = torch.arange(num_patches_height, device=self.inv_freq.device).unsqueeze(1)
        hpos_ids = hpos_ids.expand(-1, num_patches_width)
        wpos_ids = torch.arange(num_patches_width, device=self.inv_freq.device).unsqueeze(0)
        wpos_ids = wpos_ids.expand(num_patches_height, -1)
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Build the rotary table for the largest grid side, then gather the angles for every patch position
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb


class MLCDVisionEmbeddings(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]
        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def apply_rotary_pos_emb_vision(
    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    orig_q_dtype = q.dtype
    orig_k_dtype = k.dtype
    q, k = q.float(), k.float()
    cos, sin = cos.unsqueeze(-2), sin.unsqueeze(-2)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    q_embed = q_embed.to(orig_q_dtype)
    k_embed = k_embed.to(orig_k_dtype)
    return q_embed, k_embed


class MLCDAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper
    Multi-headed attention with RoPE. Refer to papers:
        - Attention is all you need:
            https://arxiv.org/abs/1706.03762
        - RoFormer: Enhanced Transformer with Rotary Position Embedding:
            https://arxiv.org/abs/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.num_key_value_groups = 1

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length = hidden_states.shape[:-1]

        # Project, then split heads: each of shape [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply the rotary position embeddings to queries and keys
        cos = position_embeddings[0]
        sin = position_embeddings[1]
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Each of shape [batch_size, num_heads, seq_length, head_dim], as expected by the attention backends
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        # Backends return [batch_size, seq_length, num_heads, head_dim]; merge the heads in a seq-first layout,
        # project, then restore [batch_size, seq_length, embed_dim]
        attn_output = attn_output.permute(1, 0, 2, 3).contiguous()
        attn_output = attn_output.reshape(seq_length, batch_size, -1)
        attn_output = self.out_proj(attn_output)
        attn_output = attn_output.permute(1, 0, 2).contiguous()
        return attn_output, attn_weights


class MLCDEncoderLayer(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = MLCDAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = MLCDMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs


class MLCDEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([MLCDEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    position_embeddings,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    position_embeddings,
                    attention_mask,
                    output_attentions=output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(nn.Module):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = MLCDVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = MLCDEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Rotary position embeddings for the patch grid, with a learned entry prepended for the class token
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class MLCDPreTrainedModel(PreTrainedModel):
    config_class = MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


@auto_docstring(
    custom_intro="""
    The vision model from M_L_C_D without any head or projection on top.
    """
)
class MLCDVisionModel(MLCDPreTrainedModel):
    config_class = MLCDVisionConfig
    main_input_name = "pixel_values"
    _no_split_modules = ["MLCDEncoderLayer"]

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_model = MLCDVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDPreTrainedModel", "MLCDVisionModel"]