
from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from transformers.models.llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaModelOutputWithPast,
    LlavaPreTrainedModel,
)

from ...activations import ACT2FN
from ...utils import auto_docstring, is_torchdynamo_compiling, logging
from .configuration_vipllava import VipLlavaConfig


logger = logging.get_logger(__name__)


class VipLlavaModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class VipLlavaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass
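
# Illustrative shape sketch for the projector below (editor's comment, not part of the
# library; the values are hypothetical): with k selected vision layers, vision hidden
# size H_v and text hidden size H_t, the concatenated vision features of shape
# (num_images, image_length, k * H_v) are layer-normalized, mapped to H_t by linear_1,
# passed through the configured activation, and refined by linear_2, e.g. assuming
# k == 5 and 576 patch tokens per image:
#
#   projector = VipLlavaMultiModalProjector(config)  # config: VipLlavaConfig
#   embeds = projector(torch.randn(2, 576, 5 * config.vision_config.hidden_size))
#   # embeds: (2, 576, config.text_config.hidden_size), i.e. in the text embedding space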

class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        # A single vision layer keeps the projector input at hidden_size; a list of
        # layers multiplies it, since their hidden states are concatenated beforehand.
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )
        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class VipLlavaPreTrainedModel(LlavaPreTrainedModel):
    pass


class VipLlavaModel(LlavaModel):
    def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: Union[int, List[int]]):
                  " USS9nU R                  U5      nU$ s  snf )a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layers (`Union[int, List[int]]`):
        The vision feature layer, or the list of indexes of the layers to select
        the vision feature.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
T)output_hidden_statesNr   )dim)vision_towerr-   r/   rA   torchcatmulti_modal_projector)r<   rK   r.   image_outputsimage_featuresindexs         r   get_image_features VipLlavaModel.get_image_featuresJ   s     )),T)R +S11*889NOPQSTSUPUVN VkkUjE99%@ABGUjNk"YY~2>N33NC ls   !BN	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrM   return_dictcache_positionreturnc                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUGb   U R                  X'S9nXR                   R                  :H  R                  S5      nUR                  U5      R                  UR                  5      n[        5       (       dz  Xo   R                  5       UR                  5       :w  aV  XR                   R                  :H  R                  5       nUR                   S   UR                   S   -  n[        SU S	U 35      eUR                  UR                  UR"                  5      nUR%                  X5      nU R&                  " SUUUUUU	U
S
US.	UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WOSS9nU(       a  U$ UR3                  5       $ )z
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )

            # Scatter the projected image features into the <image> placeholder positions.
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()

class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested positions (e.g. the final token during
        # generation), avoiding a full (batch, seq_len, vocab_size) projection.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )
r   r   )rI   r   rF   ) typingr   r   r   r   rQ   r   (transformers.models.llava.modeling_llavar   r	   r
   r   r   activationsr   utilsr   r   r   configuration_vipllavar   
get_loggerr   loggerr   r"   Moduler%   rF   rI   r   __all__r   r   r   <module>r      s     0 /    " F F 2 
		H	%	": 		%@ 	")) 0	2 	i<J i<X^
'D ^
B [r   