
from typing import List, Optional, Tuple, Union

import torch
from torch import nn

from transformers.models.llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaModelOutputWithPast,
    LlavaPreTrainedModel,
)

from ...activations import ACT2FN
from ...utils import auto_docstring, is_torchdynamo_compiling, logging
from .configuration_vipllava import VipLlavaConfig


logger = logging.get_logger(__name__)


class VipLlavaModelOutputWithPast(LlavaModelOutputWithPast):
    pass


class VipLlavaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass
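
# Illustrative shape sketch for the projector below (editor's comment, not part of the
# library; the values are hypothetical): with k selected vision layers, vision hidden
# size H_v and text hidden size H_t, the concatenated vision features of shape
# (num_images, image_length, k * H_v) are layer-normalized, mapped to H_t by linear_1,
# passed through the configured activation, and refined by linear_2, e.g. assuming
# k == 5 and 576 patch tokens per image:
#
#   projector = VipLlavaMultiModalProjector(config)  # config: VipLlavaConfig
#   embeds = projector(torch.randn(2, 576, 5 * config.vision_config.hidden_size))
#   # embeds: (2, 576, config.text_config.hidden_size), i.e. in the text embedding space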

class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        # A single vision layer keeps the projector input at hidden_size; a list of
        # layers multiplies it, since their hidden states are concatenated beforehand.
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )
        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class VipLlavaPreTrainedModel(LlavaPreTrainedModel):
    pass


class VipLlavaModel(LlavaModel):
    def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_layers: Union[int, List[int]]):
                  " USS9nU R                  U5      nU$ s  snf )a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layers (`Union[int, List[int]]`):
        The vision feature layer, or the list of indexes of the layers to select
        the vision feature.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
T)output_hidden_statesNr   )dim)vision_towerr-   r/   rA   torchcatmulti_modal_projector)r<   rK   r.   image_outputsimage_featuresindexs         r   get_image_features VipLlavaModel.get_image_featuresJ   s     )),T)R +S11*889NOPQSTSUPUVN VkkUjE99%@ABGUjNk"YY~2>N33NC ls   !BN	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrM   return_dictcache_positionreturnc                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUb  Ub  [        S5      eUc  U R                  5       " U5      nUGb   U R                  X'S9nXR                   R                  :H  R                  S5      nUR                  U5      R                  UR                  5      n[        5       (       dz  Xo   R                  5       UR                  5       :w  aV  XR                   R                  :H  R                  5       nUR                   S   UR                   S   -  n[        SU S	U 35      eUR                  UR                  UR"                  5      nUR%                  X5      nU R&                  " SUUUUUU	U
S
US.	UD6n[)        UR*                  UR,                  UR.                  UR0                  Ub  WOSS9nU(       a  U$ UR3                  5       $ )z
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if pixel_values is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
            )

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )

            # Scatter the projected image features into the <image> placeholder positions.
            special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()

class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, List[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute logits for the requested positions (e.g. the final token during
        # generation), avoiding a full (batch, seq_len, vocab_size) projection.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )
r   r   )rI   r   rF   ) typingr   r   r   r   rQ   r   (transformers.models.llava.modeling_llavar   r	   r
   r   r   activationsr   utilsr   r   r   configuration_vipllavar   
get_loggerr   loggerr   r"   Moduler%   rF   rI   r   __all__r   r   r   <module>r      s     0 /    " F F 2 
		H	%	": 		%@ 	")) 0	2 	i<J i<X^
'D ^
B [r   