
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    KwargsForCausalLM,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
[`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
[`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
the defaults will yield a similar configuration to that of the Instructblipvideo
[Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
    qformer_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize any [`PretrainedConfig`].
    num_query_tokens (`int`, *optional*, defaults to 32):
        The number of query tokens passed through the Transformer.

    video_token_index (`int`, *optional*):
        Token index of special video token.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import (
...     InstructBlipVideoVisionConfig,
...     InstructBlipVideoQFormerConfig,
...     OPTConfig,
...     InstructBlipVideoConfig,
...     InstructBlipVideoForConditionalGeneration,
... )

>>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
>>> configuration = InstructBlipVideoConfig()

>>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
>>> model = InstructBlipVideoForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

>>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
>>> vision_config = InstructBlipVideoVisionConfig()
>>> qformer_config = InstructBlipVideoQFormerConfig()
>>> text_config = OPTConfig()

>>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
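
>>> # Optionally round-trip the composed config to disk ("./instructblipvideo-flan-t5" is a placeholder path)
>>> config.save_pretrained("./instructblipvideo-flan-t5")
>>> config = InstructBlipVideoConfig.from_pretrained("./instructblipvideo-flan-t5")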
```"""

    model_type = "instructblipvideo"
    attribute_map = {"video_token_id": "video_token_index"}
    sub_configs = {
        "text_config": AutoConfig,
        "qformer_config": InstructBlipVideoQFormerConfig,
        "vision_config": InstructBlipVideoVisionConfig,
    }

    def __init__(
        self,
        vision_config=None,
        qformer_config=None,
        text_config=None,
        num_query_tokens=32,
        video_token_index=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
        text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.num_query_tokens = num_query_tokens
        self.video_token_index = video_token_index
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVideoVisionConfig,
        qformer_config: InstructBlipVideoQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
language model configurations.

Returns:
    [`InstructBlipVideoConfig`]: An instance of a configuration object
        """
        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


@dataclass
class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the frames through the vision encoder,
        # processed as one large batch and unbatched back afterwards
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the frame embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        language_model_inputs = self.language_projection(query_output)

        # unbatch by video: each sample ends up with `num_query_tokens * frames` visual embeddings
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
        inputs_embeds[special_image_mask] = language_model_inputs.flatten()

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        r"""
Encodes videos into continuous embeddings that can be forwarded to the language model.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, height, width)`):
        The tensors corresponding to the input video frames.
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        # forward the query tokens through the Q-Former, using the frame embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        language_model_inputs = self.language_projection(query_output)
        # unbatch by video: each sample ends up with `num_query_tokens * frames` visual embeddings
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        pass

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`List[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        language_model_attention_mask = torch.ones(
            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
        )

        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        if getattr(self.config, "video_token_index", None) is not None:
            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
            inputs_embeds[special_image_mask] = language_model_inputs.flatten().to(inputs_embeds.device)
        else:
            logger.warning_once(
                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
            )
            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
            attention_mask = torch.cat(
                [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1
            )

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
        (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        language_attention_mask = torch.ones(
            language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device
        )

        if input_ids is None:
            start_tokens = [self.config.text_config.bos_token_id]
            if getattr(self.config, "video_token_index", None) is not None:
                start_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4 + start_tokens
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if getattr(self.config, "video_token_index", None) is not None:
            special_image_mask = (input_ids == self.config.video_token_index).unsqueeze(-1).expand_as(inputs_embeds)
            inputs_embeds[special_image_mask] = language_model_inputs.flatten().to(inputs_embeds.device)
        else:
            logger.warning_once(
                "Expanding inputs for video tokens in InstructBLIPVideo should be done in processing. "
                "Please follow instruction here (https://gist.github.com/zucchini-nlp/65f22892b054dc0d68228af56fbeaac2) to update your InstructBLIPVideo model. "
                "Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
            )
            inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
            attention_mask = torch.cat(
                [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1
            )

            # add image_embeds length to max_length, so that the final max_length only counts token embeds
            if not self.language_model.config.is_encoder_decoder:
                generate_kwargs["max_length"] = (
                    generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1
                )
                generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1]

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]