
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor
from ..idefics3.modeling_idefics3 import (
    Idefics3BaseModelOutputWithPast,
    Idefics3ForConditionalGeneration,
    Idefics3Model,
    Idefics3PreTrainedModel,
    Idefics3VisionTransformer,
)


logger = logging.get_logger(__name__)


class SmolVLMVisionConfig(Idefics3VisionConfig):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMVisionModel`]. It is used to instantiate a
SmolVLM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
[google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) used in SmolVLM
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1152):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

Example:

```python
>>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
>>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

>>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
>>> configuration = SmolVLMVisionConfig()

>>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
>>> model = SmolVLMVisionTransformer(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
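
>>> # Hedged extra sketch: individual config fields can be overridden at construction time;
>>> # the sizes below are arbitrary illustrative values, not recommended settings.
>>> tiny_configuration = SmolVLMVisionConfig(hidden_size=256, num_hidden_layers=4, num_attention_heads=4)
>>> tiny_configuration.hidden_size
256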
```"""

    model_type = "smolvlm_vision"


class SmolVLMPreTrainedModel(Idefics3PreTrainedModel):
    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class SmolVLMVisionTransformer(Idefics3VisionTransformer):
    pass


class SmolVLMConfig(Idefics3Config):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMModel`]. It is used to instantiate a
SmolVLM model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the model of the SmolVLM
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should cache the key/value pairs of the attention mechanism. Only
        relevant if `config.is_decoder=True`.
    image_token_id (`int`, *optional*, defaults to 128257):
        The id of the "image" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to tie the word embeddings with the token embeddings.
    vision_config (`SmolVLMVisionConfig` or `dict`, *optional*, defaults to `SmolVLMVisionConfig`):
        Custom vision config or dict for the vision tower.
    text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
        Custom text config or dict for the text model
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.
    pad_token_id (`int`, *optional*, defaults to 128002):
        The id of the padding token.

Example:
```python
>>> from transformers import SmolVLMModel, SmolVLMConfig
>>> # Initializing configuration
>>> configuration = SmolVLMConfig()
>>> # Initializing a model from the configuration
>>> model = SmolVLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
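
>>> # Hedged extra sketch: top-level options such as `scale_factor` can likewise be
>>> # overridden at construction time (the value below is illustrative only).
>>> custom_configuration = SmolVLMConfig(scale_factor=4)
>>> custom_configuration.scale_factor
4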
```"""

    model_type = "smolvlm"


class SmolVLMImageProcessor(Idefics3ImageProcessor):
    pass


class SmolVLMBaseModelOutputWithPast(Idefics3BaseModelOutputWithPast):
    pass


class SmolVLMModel(Idefics3Model):
    """
A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
in forward. Instead, we override inputs_merger here with custom logic.
    """

    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ) -> torch.Tensor:
        _, patch_size, _ = image_hidden_states.shape

        # Every sample must use a whole number of patch_size-sized blocks of <image> tokens.
        image_mask = input_ids == self.image_token_id
        num_image_tokens = image_mask.sum(dim=1)
        if not torch.all(num_image_tokens % patch_size == 0):
            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

        blocks_per_sample = num_image_tokens // patch_size

        # Map every <image> token to its (block, position-within-block) index in image_hidden_states.
        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            pixel_attention_mask (`torch.LongTensor`, *optional*):
                The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Discard padding images (all-zero tensors), keeping at least one so the vision tower can run.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        if not any(real_images_inds):
            real_images_inds[0] = True
        pixel_values = pixel_values[real_images_inds].contiguous()

        # Build (or filter) the pixel-level attention mask to match the kept images.
        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        # A patch is attended to as soon as it contains at least one valid pixel.
        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Vision encoder, then modality projection / connector.
        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
        image_hidden_states = image_hidden_states.last_hidden_state
        image_hidden_states = self.connector(image_hidden_states)
        return image_hidden_states

    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, SmolVLMBaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_seen_tokens = 0
        if use_cache:
            if past_key_values is None:
                past_key_values = DynamicCache()
            past_seen_tokens = past_key_values.get_seq_length()

        if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
            raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")

        if inputs_embeds is None:
            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)

        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

        if inputs_embeds is not None and image_hidden_states is not None:
            # Merge image embeddings into the text embeddings at the <image> token positions.
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                image_hidden_states=image_hidden_states,
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )


class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def forward(self, **super_kwargs):
        r"""
Example:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from io import BytesIO

>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> from transformers.image_utils import load_image

>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
>>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
>>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

>>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
>>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

>>> # Create inputs
>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "video", "path": "path/to/video"},
...             {"type": "text", "text": "What is happening in this video?"},
...         ]
...     }
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     add_generation_prompt=True,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
... ).to(model.device)

>>> # Generate
>>> generated_ids = model.generate(**inputs, max_new_tokens=256)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

>>> print(generated_texts)
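
>>> # Note: "path/to/video" above is a placeholder rather than a bundled asset; point it at a
>>> # real local video file before running. `generated_texts` is a list with one decoded
>>> # string per sequence in the batch.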
```"""
        super().forward(**super_kwargs)


__all__ = [
    "SmolVLMVisionConfig",
    "SmolVLMConfig",
    "SmolVLMImageProcessor",
    "SmolVLMForConditionalGeneration",
    "SmolVLMPreTrainedModel",
    "SmolVLMModel",
    "SmolVLMVisionTransformer",
]