from typing import Callable, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ..clip.modeling_clip import (
    CLIPMLP,
    CLIPAttention,
    CLIPEncoder,
    CLIPEncoderLayer,
    CLIPVisionEmbeddings,
    CLIPVisionModel,
    CLIPVisionTransformer,
)
from ..llama.modeling_llama import eager_attention_forward
from ..qwen2_vl.modeling_qwen2_vl import VisionRotaryEmbedding, apply_rotary_pos_emb_vision


logger = logging.get_logger(__name__)


class MLCDVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "mlcd_vision_model"
    base_config_key = "vision_config"

    def __init__(
        self,
        hidden_size=1664,
        intermediate_size=8192,
        num_hidden_layers=48,
        num_attention_heads=16,
        num_key_value_groups=1,
        num_channels=3,
        image_size=336,
        patch_size=14,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_groups = num_key_value_groups
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act


class MLCDMLP(CLIPMLP):
    pass


class MLCDRotaryEmbedding(VisionRotaryEmbedding):
    def forward(self, num_patches_height: int, num_patches_width: int) -> torch.Tensor:
        """
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        """
        # Generate position IDs for the height and width dimensions
        hpos_ids = (
            torch.arange(num_patches_height, device=self.inv_freq.device)
            .unsqueeze(1)
            .expand(-1, num_patches_width)
        )
        wpos_ids = (
            torch.arange(num_patches_width, device=self.inv_freq.device)
            .unsqueeze(0)
            .expand(num_patches_height, -1)
        )

        # Flatten and stack the position IDs into (num_patches, 2)
        pos_ids = torch.stack([hpos_ids.flatten(), wpos_ids.flatten()], dim=-1)

        # Compute the frequency table for the largest grid dimension and gather per position
        max_grid_size = max(num_patches_height, num_patches_width)
        seq = torch.arange(max_grid_size, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
        rotary_pos_emb_full = torch.outer(seq, self.inv_freq)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)

        return rotary_pos_emb


class MLCDVisionEmbeddings(CLIPVisionEmbeddings):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        del self.position_embedding

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        batch_size = pixel_values.shape[0]
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        return embeddings


class MLCDAttention(CLIPAttention):
    """Multi-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://arxiv.org/abs/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://arxiv.org/abs/2104.09864
    """

    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.num_key_value_groups = config.num_key_value_groups
        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_length = hidden_states.shape[:-1]

        # Project to queries/keys/values of shape [batch_size, seq_length, num_heads, head_dim]
        query_states = self.q_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        key_states = self.k_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)
        value_states = self.v_proj(hidden_states).reshape(batch_size, seq_length, self.num_heads, self.head_dim)

        # Apply rotary position embeddings to queries and keys
        cos = position_embeddings[0].unsqueeze(0).float()
        sin = position_embeddings[1].unsqueeze(0).float()
        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)

        # Move to [batch_size, num_heads, seq_length, head_dim] for the attention kernels
        query_states = query_states.permute(0, 2, 1, 3).contiguous()
        key_states = key_states.permute(0, 2, 1, 3).contiguous()
        value_states = value_states.permute(0, 2, 1, 3).contiguous()

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        attn_output = attn_output.permute(1, 0, 2, 3).contiguous()
        attn_output = attn_output.reshape(seq_length, batch_size, -1)
        attn_output = self.out_proj(attn_output)
        attn_output = attn_output.permute(1, 0, 2).contiguous()
        return attn_output, attn_weights


class MLCDEncoderLayer(CLIPEncoderLayer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.self_attn = MLCDAttention(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        r"""
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class MLCDEncoder(CLIPEncoder):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    """

    def __init__(self, config: MLCDVisionConfig):
        """Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`."""
        super().__init__(config)

    def forward(
        self,
        inputs_embeds: torch.FloatTensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    position_embeddings,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states=hidden_states,
                    position_embeddings=position_embeddings,
                    attention_mask=attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class MLCDVisionTransformer(CLIPVisionTransformer):
    def __init__(self, config: MLCDVisionConfig):
        super().__init__(config)
        self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2)
        self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2))

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Build rotary position embeddings for the patch grid, with a learned row for the class token
        num_patches_height = pixel_values.shape[-2] // self.config.patch_size
        num_patches_width = pixel_values.shape[-1] // self.config.patch_size
        rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width)
        rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device)
        rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0)
        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        hidden_states = self.embeddings(pixel_values)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

class MLCDPreTrainedModel(PreTrainedModel):
    config_class = MLCDVisionConfig
    base_model_prefix = "mlcd"
    supports_gradient_checkpointing = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, MLCDVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, MLCDAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, MLCDMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, MLCDVisionTransformer):
            factor = self.config.initializer_factor
            pos_emb_std = (module.config.hidden_size // module.config.num_attention_heads // 2) ** -0.5 * factor
            nn.init.normal_(module.class_pos_emb, mean=0.0, std=pos_emb_std)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()


class MLCDVisionModel(CLIPVisionModel):
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Example:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


__all__ = ["MLCDVisionConfig", "MLCDVisionModel", "MLCDPreTrainedModel"]