from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, ModelOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import LossKwargs, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from .configuration_smolvlm import SmolVLMConfig, SmolVLMVisionConfig


logger = logging.get_logger(__name__)


@auto_docstring
class SmolVLMPreTrainedModel(PreTrainedModel):
    config_class = SmolVLMConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["SmolVLMVisionAttention", "SmolVLMDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()


class SmolVLMVisionEmbeddings(nn.Module):
    """
    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
    resolution.

    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
    which allows treating images in their native aspect ratio and without the need to resize them to the same
    fixed size. In particular, we start from the original pre-trained SigLIP model
    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
    """

    def __init__(self, config: SmolVLMVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
        batch_size, _, max_im_h, max_im_w = pixel_values.shape

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)

            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)
        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class SmolVLMVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.is_causal = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class SmolVLMVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SmolVLMEncoderLayer(nn.Module):
    def __init__(self, config: SmolVLMVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = SmolVLMVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SmolVLMVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)
        return outputs


class SmolVLMEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SmolVLMEncoderLayer`].

    Args:
        config: SmolVLMConfig
    """

    def __init__(self, config: SmolVLMConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SmolVLMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


@auto_docstring(
    custom_intro="""
    The SmolVLM Vision Transformer Model outputting raw image embedding.
    """
)
class SmolVLMVisionTransformer(SmolVLMPreTrainedModel):
    config_class = SmolVLMVisionConfig
    _supports_sdpa = True
    _supports_flash_attention_2 = True
    _supports_flex_attn = True

    def __init__(self, config: SmolVLMVisionConfig):
        super().__init__(config)
        embed_dim = config.hidden_size

        self.embeddings = SmolVLMVisionEmbeddings(config)
        self.encoder = SmolVLMEncoder(config)
        self.patch_size = config.patch_size
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    def get_input_embeddings(self):
        return self.embeddings

    def set_input_embeddings(self, value):
        self.embeddings = value

    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_size = self.patch_size
            patch_attention_mask = torch.ones(
                (batch_size, pixel_values.size(2) // patch_size, pixel_values.size(3) // patch_size)
            )
            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)

        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # When no patch is padded, drop the mask entirely (attending to the full sequence is equivalent
        # and cheaper); otherwise expand it to a 4D additive mask for non-flash-attention backends.
        if not torch.any(~patch_attention_mask):
            patch_attention_mask = None
        elif not self._use_flash_attention_2:
            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=patch_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            return (last_hidden_state,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=last_hidden_state,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
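

# Illustrative sketch, not part of the original transformers module: running the vision
# tower on a single padded image with an explicit patch-level attention mask. The config
# values below are assumptions chosen only to keep the example tiny.
def _vision_tower_sketch():
    config = SmolVLMVisionConfig(
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        image_size=64,
        patch_size=16,
    )
    vision_model = SmolVLMVisionTransformer(config)
    pixel_values = torch.randn(1, 3, 64, 64)
    # The mask lives at patch resolution (4 x 4 patches of 16 x 16 pixels); mark the
    # bottom row of patches as padding.
    patch_attention_mask = torch.ones(1, 4, 4, dtype=torch.bool)
    patch_attention_mask[:, -1, :] = False
    outputs = vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
    # One embedding per patch: (1, 16, 64).
    return outputs.last_hidden_state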


@dataclass
class SmolVLMBaseModelOutputWithPast(ModelOutput):
    """
    Base class for SmolVLM model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.
            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class SmolVLMSimpleMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        input_size = config.vision_config.hidden_size * (config.scale_factor**2)
        output_size = config.text_config.hidden_size
        self.proj = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        return self.proj(x)


class SmolVLMConnector(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.scale_factor = config.scale_factor
        self.modality_projection = SmolVLMSimpleMLP(config)

    def pixel_shuffle(self, x, scale_factor=2):
        bsz, seq, embed_dim = x.size()
        height = width = int(seq**0.5)
        x = x.view(bsz, height, width, embed_dim)
        x = x.view(bsz, height, int(width / scale_factor), embed_dim * scale_factor)
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
        x = x.permute(0, 2, 1, 3)
        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
        return x

    def forward(self, image_hidden_states):
        image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
        image_hidden_states = self.modality_projection(image_hidden_states)
        return image_hidden_states


@auto_docstring(
    custom_intro="""
    SmolVLM model consisting of a SIGLIP vision encoder and Llama3 language decoder
    """
)
class SmolVLMModel(SmolVLMPreTrainedModel):
    """
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    """

    def __init__(self, config: SmolVLMConfig):
        super().__init__(config)
        self.padding_idx = self.config.text_config.pad_token_id
        self.vocab_size = self.config.text_config.vocab_size

        self.vision_model = SmolVLMVisionTransformer._from_config(config.vision_config)
        self.connector = SmolVLMConnector(config)
        self.text_model = AutoModel.from_config(config.text_config)

        self.image_seq_len = int(
            ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
        )
        self.image_token_id = self.config.image_token_id

        self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2"

        self.post_init()

    def enable_input_require_grads(self):
        """
        Enables the gradients for the input embeddings.

        This is useful for lora when using gradient checkpointing.
        c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032

        Override to set output.requires_grad = True for both the decoder's and vision model's embeddings.
        """

        def get_lowest_module(module):
            if len(list(module.children())) == 0:
                # A leaf module (e.g. Linear, Conv2d) -- this is where the hook should go.
                return module
            else:
                # Recurse into the first child until a leaf is found.
                return get_lowest_module(list(module.children())[0])

        def make_inputs_require_grads(module, input, output):
            output.requires_grad_(True)

        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
        self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook(
            make_inputs_require_grads
        )

    def disable_input_require_grads(self):
        self._text_require_grads_hook.remove()
        self._vision_require_grads_hook.remove()

    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ) -> torch.Tensor:
        """
        This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
        The merging happens as follows:
        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
        - We get the image hidden states for the image through the vision encoder and that hidden state, after a pixel shuffle operation, is then projected into the text embedding space.
        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
        - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
        """
        _, patch_size, _ = image_hidden_states.shape

        image_mask = input_ids == self.image_token_id
        num_image_tokens = image_mask.sum(dim=1)
        if not torch.all(num_image_tokens % patch_size == 0):
            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

        blocks_per_sample = num_image_tokens // patch_size

        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            pixel_attention_mask (`torch.LongTensor`, *optional*):
                The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Remove padding images -- padding images are entirely 0.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        if not any(real_images_inds):
            # No images: keep a single empty image so the vision tower still runs.
            real_images_inds[0] = True

        pixel_values = pixel_values[real_images_inds].contiguous()

        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            # Remove padding images from the mask as well.
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Vision encoder, then modality projection and pixel-shuffle resampling.
        image_hidden_states = self.vision_model(
            pixel_values=pixel_values, patch_attention_mask=patch_attention_mask
        ).last_hidden_state
        image_hidden_states = self.connector(image_hidden_states)
        return image_hidden_states

    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, SmolVLMBaseModelOutputWithPast]:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_seen_tokens = 0
        if use_cache:
            if past_key_values is None:
                past_key_values = DynamicCache()
            past_seen_tokens = past_key_values.get_seq_length()

        if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
            raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids).to(self.device)

        # Visual inputs integration
        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

        if inputs_embeds is not None and image_hidden_states is not None:
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                image_hidden_states=image_hidden_states,
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=True,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )


@dataclass
class SmolVLMCausalLMOutputWithPast(ModelOutput):
    """
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The SmolVLM Model with a language modeling head. It is made up of a SigLIP vision encoder, with a language modeling head on top.
    """
)
class SmolVLMForConditionalGeneration(SmolVLMPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.image_token_id = self.config.image_token_id

        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.vocab_size = config.text_config.vocab_size

        # Initialize weights and apply final processing
        self.post_init()
    def enable_input_require_grads(self):
        """
        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
        the model weights fixed.
        """

        def make_inputs_require_grads(module, input, output):
            output.requires_grad_(True)

        self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads)
        self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook(
            make_inputs_require_grads
        )

    def disable_input_require_grads(self):
        self._text_require_grads_hook.remove()
        self._vision_require_grads_hook.remove()

    def get_input_embeddings(self):
        return self.model.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.text_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, SmolVLMCausalLMOutputWithPast]:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `SmolVLMForConditionalGeneration`).
            Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from io import BytesIO

        >>> from transformers import AutoProcessor, AutoModelForImageTextToText
        >>> from transformers.image_utils import load_image

        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

        >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
        >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

        >>> # Create inputs
        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {"type": "video", "path": path/to/video},
        ...             {"type": "text", "text": "What is happening in this video?"},
        ...         ]
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

        >>> # Generate
        >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

        >>> print(generated_texts)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            image_hidden_states=image_hidden_states,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=True,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the necessary logits; keep them in the original dtype unless a loss is needed.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return SmolVLMCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        pixel_values=None,
        pixel_attention_mask=None,
        image_hidden_states=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overridden because pixel_values and image_hidden_states are mutually exclusive inputs.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            pixel_values=pixel_values,
            pixel_attention_mask=pixel_attention_mask,
            image_hidden_states=image_hidden_states,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        # On the first forward pass both ids and embeds need to be present.
        if inputs_embeds is not None and cache_position[0] == 0:
            model_inputs["input_ids"] = input_ids

        # Once image hidden states are precomputed, raw pixels are no longer needed.
        if image_hidden_states is not None:
            model_inputs["pixel_values"] = None
            model_inputs["pixel_attention_mask"] = None

        return model_inputs

    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
        model_kwargs = super()._update_model_kwargs_for_generation(
            outputs=outputs,
            model_kwargs=model_kwargs,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )
        # Carry the precomputed image_hidden_states over to the next decoding step.
        model_kwargs["image_hidden_states"] = outputs.image_hidden_states
        return model_kwargs


__all__ = ["SmolVLMForConditionalGeneration", "SmolVLMPreTrainedModel", "SmolVLMModel", "SmolVLMVisionTransformer"]