
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple, is_torchdynamo_compiling
from ..auto import AutoModel
from .configuration_vipllava import VipLlavaConfig


@dataclass
class VipLlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
Base class for VipLlava outputs, with hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        Image hidden states of the model, produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations____static_attributes__r       f/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   %   s    8 8<%"3"34;r$   r   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)VipLlavaCausalLMOutputWithPastF   a@  
Base class for VipLlava causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attention weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        Image hidden states of the model, produced by the vision encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr   r   )r   r   r   r   r   r)   r   r    r!   r"   r*   r+   r   r,   r   r-   r   r#   r   r$   r%   r'   r'   F   s    < )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r$   r'   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )VipLlavaMultiModalProjectorn   configc                 B  > [         TU ]  5         [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " X!R                  R                  -  UR                  S9U l        [        R                  " X!R                  R                  -  UR                  R                  SS9U l        [        UR                      U l        [        R                  " UR                  R                  UR                  R                  SS9U l        g )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r
   projector_hidden_actactlinear_2)selfr1   num_feature_layers	__class__s      r%   r7   $VipLlavaMultiModalProjector.__init__o   s    ",V-I-I3"O"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr$   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)r@   rC   rE   rF   )rG   r,   s     r%   forward#VipLlavaMultiModalProjector.forward~   sB    00?m4/m4r$   )rE   rC   rF   r@   )	r   r   r   r   r   r7   rM   r#   __classcell__rI   s   @r%   r/   r/   n   s    m~ m r$   r/   c                   B    \ rS rSr\rSrSrSrSr	Sr
SrSrSrSrS rSrg)VipLlavaPreTrainedModel    Tr+   c                 4   [        U R                  SU R                  R                  5       R                  5      n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g g )Ninitializer_rangeg        )meanstdg      ?)getattrr1   get_text_configrV   r8   r   rA   weightdatanormal_r5   zero_r<   fill_)rG   modulerX   s      r%   _init_weights%VipLlavaPreTrainedModel._init_weights   s     dkk#68S8S8U8g8ghfbii((MM&&CS&9{{&  &&( '--MM$$S)KK""$ .r$   r   N)r   r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendra   r#   r   r$   r%   rR   rR      sA    !L&*#"3 !N $!"&%r$   rR   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                     ^  \ rS rSrSS0rS\4U 4S jjrS rS rS\	R                  S	\\\\   4   4S
 jr\            SS\	R                   S\	R                  S\\	R$                     S\\	R                      S\\\	R                        S\\	R                     S	\\\\\   4      S\\   S\\   S\\   S\\   S\\	R                      S\\\4   4S jj5       rSrU =r$ )VipLlavaModel   zlanguage_model.modellanguage_modelr1   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g rL   )r6   r7   r   from_configr=   vision_towerr/   multi_modal_projectorrB   rq   	post_initrG   r1   rI   s     r%   r7   VipLlavaModel.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr$   c                 6    U R                   R                  5       $ rL   )rq   get_input_embeddingsrG   s    r%   rz   "VipLlavaModel.get_input_embeddings   s    ""7799r$   c                 :    U R                   R                  U5        g rL   )rq   set_input_embeddingsrG   values     r%   r~   "VipLlavaModel.set_input_embeddings   s    007r$   pixel_valuesr9   c                 &   U R                  USS9n[        U[        5      (       a  UR                  U   SS2SS24   nO<U Vs/ s H  oSR                  U   SS2SS24   PM     nn[        R
                  " USS9nU R                  U5      nU$ s  snf )a  
Obtains image last hidden states from the vision tower and applies multimodal projection.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layers (`Union[int, List[int]]`):
        The vision feature layer, or the list of indexes of the layers to select
        the vision feature.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # If multiple feature layers are provided (which is usually the case), the features of the
        # requested layers are concatenated after the CLS token is removed.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaModelOutputWithPast]:
        r"""
vision_feature_layers (`Union[int, List[int]]`, *optional*):
    The vision feature layer, or the list of indexes of the layers to select
    the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()
@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    """
)
class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.model = VipLlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
vision_feature_layers (`Union[int, List[int]]`, *optional*):
    The vision feature layer, or the list of indexes of the layers to select
    the vision feature.

Example:

```python
>>> import torch
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

>>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
>>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

>>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
>>> question = "Can you please describe this image?"
>>> prompt = prompt.format(question)
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=20)
>>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
The image features a brown and white cat sitting on a green surface, with a red ball in its
```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layers=vision_feature_layers,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not
            # contain special image tokens anymore; otherwise pixel values need to be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        r"""
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]