from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling
from ..auto import AutoModel
from .configuration_vipllava import VipLlavaConfig


@dataclass
class VipLlavaModelOutputWithPast(BaseModelOutputWithPast):
    """
    Base class for VipLlava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__ r   r   ]/var/www/auris/lib/python3.10/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   %   s   
 r   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dZeej ed< dS )	VipLlavaCausalLMOutputWithPasta  
    Base class for VipLlava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r!   r   r   r   r   r"   r#   r   r$   r   r%   r   r   r   r   r   r    F   s   
 r    c                       s*   e Zd Zdef fddZdd Z  ZS )VipLlavaMultiModalProjectorconfigc                    s   t    t|jtrdnt|j}tj||jj	 |j
d| _tj||jj	 |jj	dd| _t|j | _tj|jj	|jj	dd| _d S )Nr   )ZepsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeZprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r	   Zprojector_hidden_actactlinear_2)selfr'   Znum_feature_layers	__class__r   r   r+   o   s   

z$VipLlavaMultiModalProjector.__init__c                 C   s,   |  |}| |}| |}| |}|S N)r3   r6   r7   r8   )r9   r$   r   r   r   forward~   s
   



z#VipLlavaMultiModalProjector.forward)r   r   r   r   r+   r=   __classcell__r   r   r:   r   r&   n   s    r&   c                   @   s<   e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZdd ZdS )VipLlavaPreTrainedModel Tr#   c                 C   s   t | jd| j j}t|tjr)|jjj	d|d |j
d ur'|j
j  d S d S t|tjr>|jjd |j
j  d S d S )Ninitializer_rangeg        )meanstdg      ?)getattrr'   Zget_text_configrA   r,   r   r4   weightdataZnormal_r)   Zzero_r0   Zfill_)r9   modulerC   r   r   r   _init_weights   s   
z%VipLlavaPreTrainedModel._init_weightsN)r   r   r   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_skip_keys_device_placementZ_supports_cache_classZ_supports_flash_attn_2Z_supports_sdpaZ_supports_quantized_cacheZ_supports_static_cacheZ_supports_attention_backendrH   r   r   r   r   r?      s    r?   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    )Zcustom_introc                       s  e Zd ZddiZdef fddZdd Zdd	 Zd
ej	de
eee f fddZe												ddejd
ej	deej deej deeej	  deej	 dee
eee f  dee dee dee dee deej de
eef fddZ  ZS )VipLlavaModelzlanguage_model.modellanguage_modelr'   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S r<   )r*   r+   r   Zfrom_configr1   vision_towerr&   multi_modal_projectorr5   rJ   	post_initr9   r'   r:   r   r   r+      s
   
zVipLlavaModel.__init__c                 C   
   | j  S r<   )rJ   get_input_embeddingsr9   r   r   r   rP         
z"VipLlavaModel.get_input_embeddingsc                 C      | j | d S r<   )rJ   set_input_embeddingsr9   valuer   r   r   rT         z"VipLlavaModel.set_input_embeddingspixel_valuesr-   c                    sb   | j |dd t|tr j| ddddf }n fdd|D }tj|dd}| |}|S )	aW  
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, List[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
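
        Example (an illustrative sketch; assumes the `llava-hf/vip-llava-7b-hf` checkpoint, its paired processor,
        and enough memory to load the 7B weights):

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> # projected features for one image, using the layers configured for this checkpoint
        >>> image_features = model.model.get_image_features(
        ...     pixel_values, vision_feature_layers=model.config.vision_feature_layers
        ... )
        >>> image_features.shape[0]
        1
        ```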
        T)output_hidden_statesNr   c                    s&   g | ]} j | d d dd f qS )Nr   )r$   ).0indexZimage_outputsr   r   
<listcomp>   s   & z4VipLlavaModel.get_image_features.<locals>.<listcomp>)dim)rK   r,   r.   r$   r   catrL   )r9   rX   r-   image_featuresr   r\   r   get_image_features   s   

z VipLlavaModel.get_image_featuresN	input_idsattention_maskposition_idsr#   inputs_embeds	use_cacheoutput_attentionsrY   return_dictcache_positionreturnc                 K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur$|n| j j}|du |duA r4td|dur@|dur@td|du rJ|  |}|dur| j||d}|| j jk	d}|
||j}t s||  | kr|| j jk }|jd |jd  }td| d	| ||j|j}|||}| jd||||||	|
d
|d	|}t|j|j|j|j|dur|ndd}|r|S | S )z
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
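
        Example (an illustrative sketch of a plain forward pass through the base model, without the language
        modeling head; assumes the `llava-hf/vip-llava-7b-hf` checkpoint and enough memory to load it):

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaModel

        >>> model = VipLlavaModel.from_pretrained("llava-hf/vip-llava-7b-hf")
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "###Human: <image> Describe the image.###Assistant:"
        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, text hidden size)
        ```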
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            # Scatter the projected image features into the positions of the <image> placeholder tokens
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()


@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c                #       s  e Zd ZdddddZdgZdef fdd	Zd
d Zdd Zde	j
fddZdd Zedd Zedd Zedd Zee														d3dejdejdeej deej deeej  d eej d!eeeee f  d"eej d#ee d$ee d%ee d&ee d'eej d(eeejf deeef fd)d*Z						d4 fd+d,	Ze dejd-ed.ed/ej!d'ejd0efd1d2Z"  Z#S )5 VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr'   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFr(   )r*   r+   rI   modelr   r4   r5   r2   
vocab_sizer|   rM   rN   r:   r   r   r+   1  s   
z)VipLlavaForConditionalGeneration.__init__c                 C   rO   r<   )r}   rP   rQ   r   r   r   rP   7  rR   z5VipLlavaForConditionalGeneration.get_input_embeddingsc                 C   rS   r<   )r}   rT   rU   r   r   r   rT   :  rW   z5VipLlavaForConditionalGeneration.set_input_embeddingsrk   c                 C   s   | j S r<   r|   rQ   r   r   r   get_output_embeddings=  s   z6VipLlavaForConditionalGeneration.get_output_embeddingsc                 C   s
   || _ d S r<   r   )r9   Znew_embeddingsr   r   r   set_output_embeddings@  rR   z6VipLlavaForConditionalGeneration.set_output_embeddingsc                 C      | j jS r<   )r}   rJ   rQ   r   r   r   rJ   D     z/VipLlavaForConditionalGeneration.language_modelc                 C   r   r<   )r}   rK   rQ   r   r   r   rK   H  r   z-VipLlavaForConditionalGeneration.vision_towerc                 C   r   r<   )r}   rL   rQ   r   r   r   rL   L  r   z6VipLlavaForConditionalGeneration.multi_modal_projectorNr   rc   rX   rd   re   r#   rf   r-   labelsrg   rh   rY   ri   rj   logits_to_keepc                 K   s   |
dur|
n| j j}
|dur|n| j j}|dur|n| j j}|dur$|n| j j}| jd|||||||	||
|d|d|}|d }t|trLt| dn|}| 	|dd|ddf }d}|durm| j
||| j jjd}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed (e.g. the last position during generation)
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs
            )

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # Pixel values are only forwarded on the pre-fill step; in later cached decoding steps the
            # input ids no longer contain image tokens, so pixel values stay out of the model inputs.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]