from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    KwargsForCausalLM,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PretrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
    ```Zinstructblipvideovideo_token_idvideo_token_index)text_configqformer_configvision_configN    c                    s   t  jdi | |d u ri }td |d u ri }td |d u r*i }td tdi || _tdi || _d|v rB|d nd}t| di || _	|| _
|| _| jj| j_| j	jtv | _d| _d| _d S )	NzZvision_config is None. initializing the InstructBlipVideoVisionConfig with default values.z\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.zTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typeoptg      ?g{Gz?r   )super__init__loggerinfor   r'   r!   r&   r   r%   num_query_tokensr$   Zhidden_sizeZencoder_hidden_sizer)   r   use_decoder_only_language_modelZinitializer_factorZinitializer_range)selfr'   r&   r%   r/   r$   kwargsZtext_model_type	__class__r   r   r,   y   s(   	



z InstructBlipVideoConfig.__init__r'   r&   r%   c                 K   s"   | d|  |  |  d|S )a  
        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        """
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


@dataclass
class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # fold the frame dimension into the batch so the vision encoder sees one frame at a time
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        # step 1: forward the frames through the vision encoder
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # the instruction is repeated per frame because frames were folded into the batch dimension
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the Q-Former output and unbatch it: each frame contributes `num_query_tokens` embeddings
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected video embeddings into the positions of the video placeholder tokens
        special_image_mask = (input_ids == self.config.video_token_id).unsqueeze(-1).expand_as(inputs_embeds)
        inputs_embeds[special_image_mask] = language_model_inputs.flatten()

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        r"""
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        T)r@   rJ   rI   r   NrM   rN   rP   rR   )rC   rD   rS   rT   rU   rI   )r]   r^   r_   r`   ra   rb   rc   rO   rd   re   rf   rg   rh   ri   rj   r[   r/   )r1   r@   rA   rB   rJ   rI   rp   rq   rr   rs   rt   rX   ru   rv   rd   rw   rx   ry   rz   r   r   r   get_video_features*  s<     
$

z<InstructBlipVideoForConditionalGeneration.get_video_featuresc                 C   s   d S )Nr   )r1   r@   rA   rB   rJ   rI   r   r   r   get_image_featuresf  s   z<InstructBlipVideoForConditionalGeneration.get_image_featuresrC   rD   rE   rF   rG   rH   labelsrK   r2   rL   c                 K   s  |dur|n| j j}| j||||dd\}}}|s| n|}|s%| n|}tj| dd tj|jd}| j	
 |}|du rGt|}t| j dddurg|| j jkd|}| |j||< ntd tj|||jgdd	}tj|||jgdd	}| j jr| j	d||||	||d
|}|r|jn|d }d}|
dur| jd||
| j jjd|}n$| j	d||||||	||
|d	|}|r|jn|d }|r|jn|d }t|||||dS )a0
  
        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTrA   rB   rJ   rI   rM   rN   r#   K  Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.rR   rP   rV   r   )logitsr   
vocab_size)	rW   rD   rE   rF   rG   rH   rI   r   rK   )lossr   rX   rY   rZ   r   )r[   r\   r   Zto_tupler`   ra   rb   rc   rO   rk   rl   rf   getattrr#   rm   rn   ro   tor-   warning_oncerh   r0   r   Zloss_functionr%   r   r   r>   )r1   r@   rA   rB   rC   rD   rE   rF   rG   rH   r   rI   rJ   rK   r2   rz   rX   rx   Zlanguage_model_attention_maskrW   r{   r|   r   r   r   r   r   r}   p  s   I
	
z1InstructBlipVideoForConditionalGeneration.forwardc                 K   s  t | dr	|   |jd }| j||||dd\}	}
}tj|	 dd tj|	jd}|du r\| j	j
jg}t| j	dddurK| j	jg| j	j d	 | }tj|gtj|jd}||d
}|du ret|}|  |}t| j	dddur|| j	jkd|}|	 |j||< n@td tj|	||	jgd
d}tj|||jgd
d}| jj	js|dd|	jd
  d
 |d< |dd|	jd
  |d< ||d}| jj	js||d< | jjdi ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        Zhf_device_mapr   Tr   NrM   rN   r#      rR   r   rP   
max_length   Z
min_length)rW   rD   rC   r   )hasattrZ_preprocess_accelerater]   r   r`   ra   rb   rc   rO   r[   r%   Zbos_token_idr   r#   r/   Ztensorrepeatrf   rl   rm   rn   ro   r   r-   r   rh   rk   Zis_encoder_decodergetgenerate)r1   r@   rA   rB   rC   rD   rJ   Zgenerate_kwargsrp   rz   rX   rx   Zlanguage_attention_maskZstart_tokensrW   r{   Zinputsr|   r   r   r   r     sR   





z2InstructBlipVideoForConditionalGeneration.generate)NFF)NNNNNNNNNFN)NNNNF)r   r   r   r`   r~   r   r   r   r   r   r   r   r   r   r>   r}   Zno_gradr   r   r   r   r   r   )  s    
@
	


 	r   )r"   r!   r   r<   r;   r=   r?   r   )-dataclassesr   typingr   r   r   r`   Ztorch.utils.checkpointZ;transformers.models.instructblip.configuration_instructblipr   r   Z6transformers.models.instructblip.modeling_instructblipr   r	   r
   r   r   r   r   Zconfiguration_utilsr   Zmodeling_flash_attention_utilsr   Zmodels.auto.modeling_autor   Zprocessing_utilsr   utilsr   autor   r   Z
get_loggerr   r-   r   r!   r"   r;   r<   r=   r>   r?   r   __all__r   r   r   r   <module>   s4   $

}d  ;
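

# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the upstream module): composes a config from
# the sub-configs defined above and prints how many video placeholder tokens
# `generate` prepends when `input_ids` is omitted (num_query_tokens per frame,
# with a fixed 4 frames per clip). Values are library defaults, not checkpoint
# numbers. Because this file uses relative imports, run it as a module, e.g.
# `python -m transformers.models.instructblipvideo.modular_instructblipvideo`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    vision_config = InstructBlipVideoVisionConfig()
    qformer_config = InstructBlipVideoQFormerConfig()
    text_config = CONFIG_MAPPING["opt"]()
    config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)

    frames = 4  # the model pools 4 uniformly sampled frames per video
    print("query tokens per frame:", config.num_query_tokens)
    print("video placeholder tokens per clip:", config.num_query_tokens * frames)
    print("decoder-only language model:", config.use_decoder_only_language_model)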