# PyTorch InternVL model (transformers/models/internvl/modeling_internvl.py).

import collections.abc
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    ModelOutput,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    logging,
    torch_int,
)
from ..auto import AutoModel
from .configuration_internvl import InternVLConfig, InternVLVisionConfig


logger = logging.get_logger(__name__)


@use_kernel_forward_from_hub("RMSNorm")
class InternVLVisionRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
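

# Illustrative sketch: how the RMS normalization above behaves on a random activation. Unlike
# nn.LayerNorm, it only rescales by the root-mean-square (plus eps) and never subtracts the mean.
# The hidden size of 8 is an arbitrary choice made purely for this example.
def _rms_norm_example():
    norm = InternVLVisionRMSNorm(hidden_size=8)
    hidden = torch.randn(2, 4, 8)
    normed = norm(hidden)
    # Each vector along the last dimension now has roughly unit RMS, scaled elementwise by the
    # learned weight (initialized to ones above).
    print(normed.shape, normed.pow(2).mean(-1).sqrt().mean().item())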
|d ur+|d d d d d d d |jd f }|
| }
tjj|
dd}
tjj|
|| jd}
t |
|	}|dd	 }||
fS )Nr   r   r0   dim)ptrainingr   )
r%   matmul	transposer8   r#   
functionalZsoftmaxrF   rK   
contiguous)r@   rA   rB   rC   rD   rE   rF   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputr.   r.   r/   eager_attention_forwardJ   s   
&rV   c                
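

# Illustrative sketch of the helper above, with arbitrary toy sizes (batch=2, heads=4, seq=16,
# head_dim=32). Any nn.Module works as the `module` argument here because only its `.training`
# flag is consulted, and dropout defaults to 0.0.
def _eager_attention_example():
    batch, heads, seq, head_dim = 2, 4, 16, 32
    query = torch.randn(batch, heads, seq, head_dim)
    key = torch.randn(batch, heads, seq, head_dim)
    value = torch.randn(batch, heads, seq, head_dim)
    out, weights = eager_attention_forward(
        nn.Identity(), query, key, value, attention_mask=None, scaling=head_dim**-0.5
    )
    print(out.shape)      # (batch, seq, heads, head_dim) -- note the transpose applied before returning
    print(weights.shape)  # (batch, heads, seq, seq)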


class InternVLVisionAttention(nn.Module):
    """Attention Class for InternVL Vision Encoder"""

    def __init__(self, config: InternVLVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        proj_dropout = config.projection_dropout
        qk_norm = config.use_qk_norm

        self.is_causal = False

        self.q_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.projection_layer = nn.Linear(self.embed_dim, self.embed_dim)
        self.projection_dropout = nn.Dropout(proj_dropout) if proj_dropout > 0 else nn.Identity()

        self.q_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()
        self.k_norm = InternVLVisionRMSNorm(self.embed_dim) if qk_norm else nn.Identity()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ):
        batch_size, seq_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        query_states = query_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=False,
            **kwargs,
        )
        attn_output = attn_output.reshape(batch_size, seq_len, self.embed_dim)

        output = self.projection_layer(attn_output)
        output = self.projection_dropout(output)

        outputs = (output, attn_weights) if output_attentions else (output, None)
        return outputs
dd ZdS )InternVLVisionPreTrainedModelZinternvl_visionpixel_valuesTInternVLVisionLayerc                 C   s:  t |tjtjtjfr%|jjjd| jj	d |j
dur#|j
j  dS dS t |tjrH|jjjd| jj	d |jdurF|jj|j   dS dS t |tjr]|j
j  |jjd dS t |tr|jj  |jdurs|jj  |jdur|jj  dS dS t |tr|jj| jj |jj| jj dS dS )zInitialize the weightsr?   r4   stdN      ?)
isinstancer#   rc   Conv2dZConvTranspose2dr'   datanormal_rX   initializer_rangerZ   zero_Z	EmbeddingZpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr|   lambda_1layer_scale_init_valuelambda_2)r)   r@   r.   r.   r/   _init_weights   s0   





z+InternVLVisionPreTrainedModel._init_weightsN)r;   r<   r=   r   config_classbase_model_prefixZmain_input_namesupports_gradient_checkpointingZ_no_split_modules_supports_sdpa_supports_flash_attn_2r   r.   r.   r.   r/   rz      s    rz   c                   @   s   e Zd ZdZdS )$InternVLVisionModelOutputWithPoolinga  
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """


class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)
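

# Illustrative sketch of the patch arithmetic above, assuming a 448x448 input with 14x14 patches
# (the values used by the released InternVL vision checkpoints; other checkpoints may differ):
# 448 / 14 = 32 patches per side, so 32 * 32 = 1024 patch embeddings per image, before the [CLS]
# token is prepended by `InternVLVisionEmbeddings` below.
def _patch_embedding_example():
    config = InternVLVisionConfig(image_size=(448, 448), patch_size=(14, 14))
    patch_embed = InternVLVisionPatchEmbeddings(config)
    pixel_values = torch.randn(1, config.num_channels, 448, 448)
    embeddings, (patch_height, patch_width) = patch_embed(pixel_values)
    print(embeddings.shape)           # (1, 1024, hidden_size)
    print(patch_height, patch_width)  # 32 32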
dZ		ddejde
ej dejfddZ  ZS )r   zc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)
t|j	|j| _d S r   )r!   r"   rX   r	   Z
hidden_actactivation_fnr#   rc   r*   Zintermediate_sizefc1fc2r)   rX   r,   r.   r/   r"   y  s
   
zInternVLVisionMLP.__init__r5   r   c                 C   s"   |  |}| |}| |}|S r   )r   r   r   )r)   r5   r.   r.   r/   r6     s   


zInternVLVisionMLP.forward)r;   r<   r=   r"   r%   ry   r6   r>   r.   r.   r,   r/   r   x  s    r   )
layer_normZrms_normc                       s`   e Zd ZdZdeddf fddZ	ddejd	ede	e
ej e
ejejf f fd
dZ  ZS )r|   z?This corresponds to the Block class in the timm implementation.rX   r   Nc                    s   t    |j| _d| _t|| _t|| _t|j	 |j
|jd| _t|j	 |j
|jd| _|j}tj|t|j
 dd| _tj|t|j
 dd| _t|j| _d S )Nr   r+   T)Zrequires_grad)r!   r"   Zchunk_size_feed_forwardZseq_len_dimrW   	attentionr   mlpNORM2FNZ	norm_typer*   layer_norm_epslayernorm_beforelayernorm_afterr   r#   r$   r%   r&   r   r   rh   r   rF   )r)   rX   Zinit_valuesr,   r.   r/   r"     s   


zInternVLVisionLayer.__init__Fr5   rl   c                 C   sl   | j | ||d\}}| j| }|| }| |}| |}| |}| jd ur.| j| }|| }||fS )N)rl   )r   r   r   r   r   rF   r   )r)   r5   rl   Zattention_outputZattention_weightsZlayer_outputr.   r.   r/   r6     s   






zInternVLVisionLayer.forward)F)r;   r<   r=   rx   r   r"   r%   ry   boolr   r   r6   r>   r.   r.   r,   r/   r|     s    r|   c                       sT   e Zd Zdeddf fddZe		ddejded	ede	e
ef fd
dZ  ZS )InternVLVisionEncoderrX   r   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r.   )r|   ).0irX   r.   r/   
<listcomp>  s    z2InternVLVisionEncoder.__init__.<locals>.<listcomp>F)	r!   r"   rX   r#   Z
ModuleListrangeZnum_hidden_layerslayergradient_checkpointingr   r,   r   r/   r"     s   
 
zInternVLVisionEncoder.__init__Fr5   rl   output_hidden_statesc           	      C   s   |rdnd }|r
dnd }t | jD ],\}}|r||f }| jr+| jr+| |j||}n|||}|d }|r=||d f }q|rE||f }t|||dS )Nr.   r   r   last_hidden_stater5   
attentions)	enumerater   r   rK   Z_gradient_checkpointing_func__call__r   )	r)   r5   rl   r   Zall_hidden_statesZall_self_attentionsr   Zlayer_moduleZlayer_outputsr.   r.   r/   r6     s*   


zInternVLVisionEncoder.forward)FF)r;   r<   r=   r   r"   r   r%   ry   r   r   r7   r   r6   r>   r.   r.   r,   r/   r     s    
r   c                       st   e Zd Zdeddf fddZdd Zee			ddej	d	e
ej d
e
e de
e deeef f
ddZ  ZS )InternVLVisionModelrX   r   Nc                    sT   t  | || _t|| _t|| _|jrt	 ntj
|j|jd| _|   d S )Nr   )r!   r"   rX   r   r   r   encoderZuse_mean_poolingr#   ri   r   r*   r   	layernorm	post_initr   r,   r.   r/   r"     s   

zInternVLVisionModel.__init__c                 C      | j jS r   )r   r   r9   r.   r.   r/   get_input_embeddings  s   z(InternVLVisionModel.get_input_embeddingsr{   r   rl   r   c           	      C   sn   |dur|n| j j}|dur|n| j j}| j||d\}}| j|||d}|d }| |}t||j|jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
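

# Illustrative sketch: running the vision backbone end to end on random pixels. The tiny
# hyper-parameters below are invented so the example is cheap to run; real checkpoints use the
# much larger values stored in their InternVLVisionConfig.
def _vision_model_example():
    config = InternVLVisionConfig(
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=4,
        intermediate_size=64,
        image_size=(56, 56),
        patch_size=(14, 14),
    )
    model = InternVLVisionModel(config).eval()
    pixel_values = torch.randn(1, config.num_channels, 56, 56)
    with torch.no_grad():
        outputs = model(pixel_values)
    # 16 patches (a 4 x 4 grid) plus the [CLS] token
    print(outputs.last_hidden_state.shape)  # (1, 17, 32)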
dZdZdZdd ZdS )InternVLPreTrainedModel Tpast_key_valuesc                 C   s   t | jd| j j}t|tjr)|jjj	d|d |j
d ur'|j
j  d S d S t|tjr>|j
j  |jjd d S d S )Nr   r?   r}   r   )getattrrX   Zget_text_configr   r   r#   rc   r'   r   r   rZ   r   r   r   )r)   r@   r~   r.   r.   r/   r   (  s   
z%InternVLPreTrainedModel._init_weightsN)r;   r<   r=   r   r   r   r   Z_skip_keys_device_placementZ_supports_cache_classr   r   Z_supports_quantized_cacheZ_supports_static_cacheZ_supports_attention_backendr   r.   r.   r.   r/   r     s    r   c                       s*   e Zd Zdef fddZdd Z  ZS )InternVLMultiModalProjectorrX   c                    sz   t    t|jjtd|j d  | _t	|jjtd|j d  |j
j| _t|j | _t	|j
j|j
j| _d S )Nr   r   )r!   r"   r#   r   vision_configr*   r   downsample_ratior   rc   text_configlinear_1r	   Zprojector_hidden_actactlinear_2r   r,   r.   r/   r"   5  s   
"z$InternVLMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S r   )r   r   r   r   )r)   image_featuresr5   r.   r.   r/   r6   >  s
   



z#InternVLMultiModalProjector.forward)r;   r<   r=   r   r"   r6   r>   r.   r.   r,   r/   r   4  s    	r   c                   @   s$   e Zd ZU dZdZeej ed< dS )InternVLModelOutputWithPasta  
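

# Illustrative sketch of the projector dimensions. With the commonly published setup of a
# 1024-dim vision hidden size and downsample_ratio = 0.5 (the actual values live in the
# checkpoint's InternVLConfig), pixel shuffle packs int(1 / 0.5) ** 2 = 4 neighbouring patch
# features into one token, so the layer norm and first linear see 1024 * 4 = 4096 input
# features before projecting into the text model's hidden size.
def _projector_dimension_example():
    config = InternVLConfig()  # default sub-configs; real checkpoints override these values
    projector = InternVLMultiModalProjector(config)
    in_features = config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2
    image_features = torch.randn(1, 64, in_features)
    projected = projector(image_features)
    print(in_features, projected.shape[-1] == config.text_config.hidden_size)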


@dataclass
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    Base class for InternVL outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	r;   r<   r=   rx   r   r   r%   FloatTensor__annotations__r.   r.   r.   r/   r   F  s   
 r   zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    )Zcustom_introc                %       s<  e Zd ZddiZdef fddZdd Zdd	 Zd
ej	de
eee f defddZee														d$dejd
ej	deej deej deeej	  deej	 dee
eee f  dee dee dee dee dee deej dejdee de
eef f ddZd%d ejd!efd"d#Z  ZS )&InternVLModelzlanguage_model.modellanguage_modelrX   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S r   )r!   r"   r   Zfrom_configr   vision_towerr   multi_modal_projectorr   r   r   r   r,   r.   r/   r"   o  s
   
zInternVLModel.__init__c                 C   
   | j  S r   )r   r   r9   r.   r.   r/   r   w     
z"InternVLModel.get_input_embeddingsc                 C      | j | d S r   )r   set_input_embeddingsr)   rC   r.   r.   r/   r   z     z"InternVLModel.set_input_embeddingsr{   vision_feature_layervision_feature_select_strategyc           
      K   s   | j j}|dkr| j|dj}n	| j|dj| }|dkr*|ddddddf }|jd }t|d }|jd }	||	||d}| j	||d}||	d|jd }| 
|}|S )	a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layer (`int` or `List[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions and apply pixel-shuffle downsampling
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor back to a token sequence and project into the text embedding space
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])
        vision_features = self.multi_modal_projector(vision_features)

        return vision_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        image_sizes: torch.Tensor = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, InternVLModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
            )

            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = special_image_mask.all(-1).sum()
                special_image_mask = special_image_mask.all(-1).unsqueeze(-1)
            else:
                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
                n_image_tokens = (input_ids == self.config.image_token_id).sum()

            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
zInternVLModel.forwardr   r   r   c              	   C   s   |  \}}}}|| dks|| dkrtd|||t|| t|| }|dddd }||t|| t|| t||d  }|dddd }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute dimensions to align the downsampled axis correctly
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve the final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features


@dataclass
class InternVLCausalLMOutputWithPast(ModelOutput):
    r"""
    Base class for InternVL causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr   r5   r   r   )r;   r<   r=   rx   r
  r   r%   r   r   r  r   r   r5   r   r   r   r.   r.   r.   r/   r	  &  s   
 r	  c                   @   s   e Zd ZdS )KwargsForCausalLMN)r;   r<   r=   r.   r.   r.   r/   r  N  s    r  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c                )       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zde	j
fddZdd Zedd Zedd Zedd Zee																d6dejdejdeej deej deeej  d eej d!eeeee f  d"ee d#eej d$ee d%ee d&ee d'ee d(eej d)eeejf d*eej d+ee deee f f$d,d-Z!						d7 fd.d/	Z"e#dejd0ed1ed2ej$d(ejd3efd4d5Z%  Z&S )8 InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightrX   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFrY   )r!   r"   r   modelr#   rc   r   r*   
vocab_sizer  r   r   r,   r.   r/   r"   _  s   
z)InternVLForConditionalGeneration.__init__c                 C   r   r   )r  r   r9   r.   r.   r/   r   e  r   z5InternVLForConditionalGeneration.get_input_embeddingsc                 C   r   r   )r  r   r   r.   r.   r/   r   h  r   z5InternVLForConditionalGeneration.set_input_embeddingsr   c                 C   s   | j S r   r  r9   r.   r.   r/   get_output_embeddingsk  s   z6InternVLForConditionalGeneration.get_output_embeddingsc                 C   s
   || _ d S r   r  )r)   Znew_embeddingsr.   r.   r/   set_output_embeddingsn  r   z6InternVLForConditionalGeneration.set_output_embeddingsc                 C   r   r   )r  r   r9   r.   r.   r/   r   r     z/InternVLForConditionalGeneration.language_modelc                 C   r   r   )r  r   r9   r.   r.   r/   r   v  r  z-InternVLForConditionalGeneration.vision_towerc                 C   r   r   )r  r   r9   r.   r.   r/   r   z  r  z6InternVLForConditionalGeneration.multi_modal_projectorNr   r   r{   rD   r   r   r   r   r   labelsr   rl   r   r   r   logits_to_keepr   rP   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur$|n| j j}|dur.|n| j j}| jd|||||||||
||d||d|}|d }t|trXt	| dn|}| 
|dd|ddf }d}|	dur}| jd||	| j jjd|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return InternVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in a cached decoding stage, pixel values should be None because the input ids do not contain
            # the special image token anymore. Otherwise pixel values need to be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        r"""
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
sr  )rz   r   r   r   r  )r?   )Fcollections.abcr   dataclassesr   typingr   r   r   r   r   r%   Ztorch.nnr#   Zactivationsr	   Z
generationr
   Zintegrationsr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   r   r   Zmodeling_utilsr   r   Zprocessing_utilsr   utilsr   r   r   r   r   r   r   autor   Zconfiguration_internvlr   r   Z
get_loggerr;   rr   r   r   ry   r  rV   rW   rz   r   r   r   r   r   r   r|   r   r   r   r   r   r   r	  r  r  __all__r.   r.   r.   r/   <module>   s   $	

Q#&^0+5  ;' q