import collections.abc
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from ..clip.modeling_clip import CLIPMLP
from ..janus.modeling_janus import JanusVisionAttention
from ..llama.modeling_llama import LlamaRMSNorm
from ..llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaPreTrainedModel,
)
from .configuration_internvl import InternVLConfig, InternVLVisionConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = key
    value_states = value

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InternVLVisionRMSNorm(LlamaRMSNorm):
    pass


class InternVLVisionAttention(JanusVisionAttention):
    def __init__(self, config: InternVLVisionConfig):
        super().__init__(config)
        del self.num_key_value_groups
        self.is_causal = False

        qk_norm = config.use_qk_norm
        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ):
        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_layer(attn_output)
        output = self.projection_dropout(output)

        outputs = (output, attn_weights) if output_attentions else (output, None)
        return outputs


@auto_docstring
class InternVLVisionPreTrainedModel(PreTrainedModel):
    config_class = InternVLVisionConfig
    base_model_prefix = "internvl_vision"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["InternVLVisionLayer"]
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, InternVLVisionEmbeddings):
            module.cls_token.data.zero_()
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, InternVLVisionLayer):
            module.lambda_1.data.fill_(self.config.layer_scale_init_value)
            module.lambda_2.data.fill_(self.config.layer_scale_init_value)


@dataclass
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """


class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
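
    Example (illustrative shapes only; `config` stands for an `InternVLVisionConfig` whose image size is
    448x448 and whose patch size is 14x14, typical InternVL values):

    ```python
    >>> patch_embeddings = InternVLVisionPatchEmbeddings(config)
    >>> pixel_values = torch.randn(2, 3, 448, 448)
    >>> embeddings, (patch_height, patch_width) = patch_embeddings(pixel_values)
    >>> (patch_height, patch_width)  # 448 / 14 = 32 patches along each side
    (32, 32)
    >>> embeddings.shape[1]  # 32 * 32 = 1024 patch embeddings per image
    1024
    ```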
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.
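
        For example (illustrative numbers): a checkpoint pretrained on 448x448 images with 14x14 patches stores a
        32x32 grid of patch position embeddings plus one *[CLS]* embedding; run on 896x448 pixel inputs, the patch
        grid is resized bicubically to 64x32 while the *[CLS]* position embedding is left unchanged.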

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(CLIPMLP):
    pass


NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        attention_output, attention_weights = self.attention(
            self.layernorm_before(hidden_states),  # layernorm is applied before self-attention
            output_attentions=output_attentions,
        )
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output, attention_weights


class InternVLVisionEncoder(nn.Module):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, output_attentions
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config
        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class InternVLPreTrainedModel(LlavaPreTrainedModel):
    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


INTERNVL_INPUTS_DOCSTRING = None


class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class InternVLModel(LlavaModel):
    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
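
        Example (illustrative shapes, using the default `scale_factor=0.5`; `model` stands for an
        `InternVLModel` instance):

        ```python
        >>> vision_features = torch.randn(2, 32, 32, 1024)
        >>> model.pixel_shuffle(vision_features).shape  # spatial dims halved, channel dim multiplied by 4
        torch.Size([2, 16, 16, 4096])
        ```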
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute dimensions to align the downsampled axis correctly
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve the final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        r"""
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layer (`int` or `List[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to final dimensions
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        vision_features = self.multi_modal_projector(vision_features)

        return vision_features


class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
    def forward(self, **super_kwargs):
        r"""
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "InternVLVisionPreTrainedModel",
    "InternVLVisionModel",
    "InternVLPreTrainedModel",
    "InternVLModel",
    "InternVLForConditionalGeneration",
]