from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from transformers.models.llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaModelOutputWithPast,
    LlavaPreTrainedModel,
)

from ...activations import ACT2FN
from ...utils import auto_docstring, is_torchdynamo_compiling, logging
from .configuration_vipllava import VipLlavaConfig


logger = logging.get_logger(__name__)


class VipLlavaModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class VipLlavaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass
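
# Reader's note: this module is the "modular" source for VipLlava — it declares only
# what differs from Llava, and the flattened modeling_vipllava.py is generated from it.
# The two output classes above add no fields; they exist so the generated code reports
# VipLlava-named outputs instead of their Llava counterparts.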

class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )

        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
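
# Shape sketch for the projector (illustrative values, not taken from any config):
# with k selected vision layers of hidden size d_v, inputs of shape
# (num_images, num_patches, k * d_v) are layer-normed, then mapped through two
# linear layers to (num_images, num_patches, d_text). Unlike the plain Llava
# projector, VipLlava layer-norms the (possibly concatenated) features first.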

class VipLlavaPreTrainedModel(LlavaPreTrainedModel):
    pass


class VipLlavaModel(LlavaModel):
    def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: Union[int, List[int]]):
        """
        Obtains the image last hidden states from the vision tower and applies the multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, List[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # A single int selects one hidden-state layer; a list selects several layers,
        # whose features are concatenated after dropping each layer's CLS token.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features
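
    # In the released VipLlava checkpoints, `vision_feature_layers` defaults to
    # several CLIP layers at once (e.g. [-2, -5, -8, -11, 6] in the config), which
    # is why the projector above expects concatenated multi-layer features.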

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

            # Scatter the projected image features into the placeholder image-token positions.
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()
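
# Contract note (documented here for readers, not new behavior): the VipLlava
# processor is expected to expand the `<image>` placeholder so that the number of
# placeholder positions in `input_ids` matches the number of projected image
# features; VipLlavaModel.forward raises a ValueError when they disagree.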
zVipLlavaModel.forward)NNNNNNNNNNNN)r   r   r   r:   FloatTensorr   r#   r   r=   r   
LongTensorr   Tensorboolr   r   r.   r   r   r   r   r1   I   sT     	


class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested positions; avoids the full vocab projection.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )
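

# Note on `logits_to_keep`: an int n keeps logits only for the last n positions
# (0 keeps all of them), which avoids materializing a full
# (batch, seq_len, vocab_size) tensor during generation, where only the final
# position is sampled; a tensor value selects arbitrary positions instead.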

__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]