o
    ZhN(                    @   s  d Z ddlmZ ddlmZmZmZmZmZm	Z	m
Z
 ddlZddlm  mZ ddlZddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 e* rddl3m4Z4 ddl5m6Z6 e+7e8Z9eG dd deZ:eG dd deZ;				dGddZ<g fddZ=G d d! d!ej>Z?G d"d# d#ej@ZAG d$d% d%ejBZCe%DeC G d&d' d'ejjBZEd(d) ZFdHd*d+ZGG d,d- d-ejBZH	.dId/ejBd0ejId1ejId2ejId3eejI d4eJd5eJfd6d7ZKG d8d9 d9ejBZLG d:d; d;ejBZMG d<d= d=ejBZNe(G d>d? d?e!ZOG d@dA dAee'ZPe(G dBdC dCeOZQG dDdE dEeOeZRg dFZSdS )JzPyTorch Idefics model.    )	dataclass)AnyCallableDictListOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)ALL_LAYERNORM_LAYERS)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformer)	BlockMask)make_flex_block_causal_maskc                   @   s   e Zd ZU dZdZeej ed< dZ	ee
e
ej   ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed< dS )IdeficsBaseModelOutputWithPasta	  
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r%   r   torchFloatTensor__annotations__r&   r   r'   r(   r)    r1   r1   [/var/www/auris/lib/python3.10/site-packages/transformers/models/idefics/modeling_idefics.pyr$   6   s   
 $r$   c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )	IdeficsCausalLMOutputWithPasta  
    Base class for Idefics causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr&   r'   r(   r)   )r*   r+   r,   r-   r4   r   r.   r/   r0   r5   r&   r   r'   r   r(   r)   r1   r1   r1   r2   r3   c   s   
  r3   Fc                 K   sJ  t | jd ddd|d| j}| d|} |dd |d< |dd |d< |dd |d< |dd |d< d|v rM|d }|d||d< |d urY|d||d	< |d d uri|d d||d< |d d ur}|d d||d< | |fS |d d ur|d d||d< | |fS |d d ur|d d||d< | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r.   arangeshapeviewrepeattodeviceindex_selectget)	input_idsZexpand_sizeis_encoder_decoderr<   Zencoder_outputsmodel_kwargsZexpanded_return_idxr;   r1   r1   r2   expand_inputs_for_generation   s:   ,		rH   c                    sf   t jt jt jd  fdd|D }|  D ]|r+tfdd|D r+d qd q| S )N)	LayerNormLinear	Embeddingc                    s   g | ]} | qS r1   r1   ).0m)mappingr1   r2   
<listcomp>       z freeze_model.<locals>.<listcomp>c                 3   s    | ]}t  |V  qd S N)
isinstance)rL   t)moduler1   r2   	<genexpr>   s    zfreeze_model.<locals>.<genexpr>TF)r
   rI   rJ   rK   modulesanyrequires_grad_)modelmodule_exceptionsZmodule_exceptions_mappedr1   )rN   rT   r2   freeze_model   s   r[   c                       sN   e Zd ZdZ				ddee ddf fddZdd	 Zdefd
dZ	  Z
S )IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    FNpartially_freezereturnc           	         s   |dur||krt d| d| t jd|||||d| || _|| _|| _|| _|r5| jd | jdkrGt	j
| j|||d| _dS dS )	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrB   dtypepadding_idxFr   )r_   r`   rB   ra   r1   )
ValueErrorsuper__init__r_   rb   num_additional_embeddingsr]   weightrX   r
   rK   additional_embedding)	selfr_   rf   r`   r]   rB   ra   rb   kwargs	__class__r1   r2   re      s2   
z"IdeficsDecoupledEmbedding.__init__c                 C   sj   | j dkrt|| jS | }t|| jk}|| }| || j }d||< t|| j}|||< |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rf   FZ	embeddingrg   cloner.   wherer_   rh   )ri   rE   Zadditional_vocab_indicesZinput_ids_additional_vocabZadditional_embeddingsZfull_vectorr1   r1   r2   forward  s   
z!IdeficsDecoupledEmbedding.forwardc                 C   s   d | j| j| j| jS )NzVnum_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={})formatr_   rf   r`   r]   ri   r1   r1   r2   
extra_repr.  s   z$IdeficsDecoupledEmbedding.extra_repr)FNNN)r*   r+   r,   r-   r   boolre   rp   strrs   __classcell__r1   r1   rk   r2   r\      s    
5'r\   c                       sj   e Zd ZdZ					ddedededed	ed
df fddZdejd
ejfddZ	d
e
fddZ  ZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    r   TNin_featuresout_featuresout_additional_featuresbiasr]   r^   c                    sr   t  ||||| || _|| _|| _|| _|r&| jd |r&| jd |dkr7t	j
|||||d| _dS dS )aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )rx   ry   r{   rB   ra   N)rd   re   rz   r]   rx   ry   rg   rX   r{   r
   rJ   additional_fc)ri   rx   ry   rz   r{   r]   rB   ra   rk   r1   r2   re   @  s$   zIdeficsDecoupledLinear.__init__inputc                 C   s:   t || j| j}| jdkr| |}t||fd}|S )Nr   r6   )rm   Zlinearrg   r{   rz   r|   r.   cat)ri   r}   outputZadditional_featuresr1   r1   r2   rp   d  s
   

zIdeficsDecoupledLinear.forwardc                 C   s    d | j| j| j| jdu| jS )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zYin_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}N)rq   rx   ry   rz   r{   r]   rr   r1   r1   r2   rs   m  s   z!IdeficsDecoupledLinear.extra_repr)r   TTNN)r*   r+   r,   r-   intrt   re   r.   Tensorrp   ru   rs   rv   r1   r1   rk   r2   rw   7  s,    	$	rw   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	IdeficsRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)rd   re   r
   	Parameterr.   onesrg   variance_epsilon)ri   hidden_sizeepsrk   r1   r2   re   z  s   

zIdeficsRMSNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   r6   T)Zkeepdim)rA   r.   float32powmeanZrsqrtr   rg   ra   Zfloat16Zbfloat16)ri   r'   Zvariancer1   r1   r2   rp     s
   
zIdeficsRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tuplerg   r>   r   rr   r1   r1   r2   rs     s   zIdeficsRMSNorm.extra_repr)r   )r*   r+   r,   re   rp   rs   rv   r1   r1   rk   r2   r   y  s    
r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )IdeficsEmbedding   '  Nc                    sz   t    || _|| _|| _d| jtjd| jdtjdj|tj	d| j   }| j
d|dd | j|| jjt d	 d S )
N      ?r   r   ra   rB   ra   inv_freqF
persistentseq_lenrB   ra   )rd   re   dimmax_position_embeddingsbaser.   r=   int64rA   floatregister_buffer_set_cos_sin_cacher   rB   Zget_default_dtype)ri   r   r   r   rB   r   rk   r1   r2   re     s   
&
zIdeficsEmbedding.__init__c                 C   s|   || _ tj| j |tjd| j}td|| j}tj||fdd}| jd|	 
|dd | jd| 
|dd d S )	Nr   zi,j->ijr6   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr.   r=   r   Ztype_asr   Zeinsumr~   r   cosrA   sin)ri   r   rB   ra   rS   ZfreqsZembr1   r1   r2   r     s   z#IdeficsEmbedding._set_cos_sin_cachec                 C   sN   || j kr| j||j|jd | jd | j|jd| jd | j|jdfS )Nr   r   )r   r   rB   ra   r   rA   r   )ri   xr   r1   r1   r2   rp     s
   
zIdeficsEmbedding.forward)r   r   NrQ   )r*   r+   r,   re   r   rp   rv   r1   r1   rk   r2   r     s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r   r   )r>   r.   r~   )r   x1Zx2r1   r1   r2   rotate_half  s   r   c                 C   sL   ||  |}||  |}| | t| |  }|| t||  }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsZunsqueeze_dimZq_embedZk_embedr1   r1   r2   apply_rotary_pos_emb  s
   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )
IdeficsMLPr   intermediate_size
hidden_actc                    sN   t    tj||dd| _tj||dd| _tj||dd| _t| | _d S )NFr{   )	rd   re   r
   rJ   	gate_proj	down_projup_projr   act_fn)ri   r   r   r   rk   r1   r2   re     s
   
zIdeficsMLP.__init__c                 C   s    |  | | || | S rQ   )r   r   r   r   )ri   r   r1   r1   r2   rp         zIdeficsMLP.forward)r*   r+   r,   r   ru   re   rp   rv   r1   r1   rk   r2   r     s    r           rT   querykeyvaluer<   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr6   )r   ra   ptrainingr   r   )r.   matmul	transposer
   
functionalZsoftmaxr   rA   ra   r   r   
contiguous)
rT   r   r   r   r<   r   r   rj   attn_weightsattn_outputr1   r1   r2   eager_attention_forward  s   
r   c                       s   e Zd ZdZ					ddedededed	ed
edee f fddZ	de
jdedefddZ							dde
jdee
j dee
j dee
j deee
j  dededee
j dee
jee
j eee
j  f fddZ  ZS ) IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FNr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc           	         s  t    || _|| _|| _|| | _|| _d| _| jd | _|| _	|d u r1t
d| jj d | j| | jkrEtd| j d| d|| _ttjdsRtd	| jrt|jd
s^| jn|jj}tj| j|| j dd| _tj||| j dd| _tj||| j dd| _n'tj| j|| j dd| _tj| j|| j dd| _tj| j|| j dd| _tj|| j |dd| _t| j| _|| _| jrt| j|jd| _t| j|jd| _ d S d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).Zscaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!rd   re   r   r   r   head_dimr   Z	is_causalr   r   loggerwarning_oncerl   r*   rc   r   hasattrr
   r   vision_configr   rJ   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)	ri   r   r   r   r   r   r   r   Zkv_input_dimrk   r1   r2   re     s|   




zIdeficsAttention.__init__tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )r?   r   r   r   r   )ri   r   r   r   r1   r1   r2   _shape]  r   zIdeficsAttention._shaper'   key_value_statesr<   r   past_key_valueoutput_attentions	use_cachecache_positionr^   c	                 K   s  | j p|d u}
| \}}}| |||| j| jdd}|
sD| |||| j| jdd}| |||| j| jdd}n)| \}}}| |||| j| jdd}| |||| j| jdd}|j	d }|d ur|||d 7 }|
s| j
|t||d\}}t|||||\}}|d urd|i}|||| j|\}}| jr| |}| |}t}| jjdkr| jjdkr|rtd	 nt| jj }|| ||||f| jsd
n| j| jd|	\}}|||d }| |}|rd }|||fS )Nr   r   r   r   )r   r   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r6   )r   sizer   r?   r   r   r   r   r   r>   r   maxr   updater   r   r   r   r   r   _attn_implementationr   r   r   r   r   r   reshaper   r   )ri   r'   r   r<   r   r   r   r   r   rj   r   r   Zq_len_Zquery_statesZ
key_statesZvalue_statesZkv_lenZ
kv_seq_lenr   r   Zcache_kwargsZattention_interfacer   r   r1   r1   r2   rp   `  s\   ""$" 





zIdeficsAttention.forward)r   FNFNNNNNFFN)r*   r+   r,   r-   r   r   rt   r   r   re   r.   r   r   
LongTensorr   rp   rv   r1   r1   rk   r2   r   	  sb    Q	r   c                       s   e Zd Zddedee f fddZ						ddejdeej d	eej	 d
ee
ej  dee dee deej	 de
ejee
ejejf  f fddZ  ZS )IdeficsDecoderLayerNr   r   c                    sr   t    |j| _t| j|j|j||d| _t| j|j|j	d| _
t|j|jd| _t|j|jd| _|j| _d S )N)r   r   r   r   r   r   r   r   r   )rd   re   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormri   r   r   rk   r1   r2   re     s"   
zIdeficsDecoderLayer.__init__Fr'   r<   r   r   r   r   r   r^   c              
   K   s   |}	|  |}| jd|||||||d|\}}
}tjj|| j| jd}|	| }|}	| |}| |}tjj|| j| jd}|	| }|f}|rN||
f7 }|rU||f7 }|S )a]  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        )r'   r<   r   r   r   r   r   r   Nr1   )r   r   r
   r   r   r   r   r   )ri   r'   r<   r   r   r   r   r   rj   residualself_attn_weightspresent_key_valueoutputsr1   r1   r2   rp     s4   





zIdeficsDecoderLayer.forwardrQ   )NNNFFN)r*   r+   r,   r   r   r   re   r.   r   r   r   rt   r/   rp   rv   r1   r1   rk   r2   r     s2    
r   c                       s   e Zd Zddedee f fddZ							ddejdeej d	eej d
eej deej dee	 dee	 dee
ej  de
ejee
ejejf  f fddZ  ZS )IdeficsGatedCrossAttentionLayerNr   r   c              	      s~  t    |j| _t| j|jd|j||j|d| _t| j|j	|j
d| _t|j|jd| _t|j|jd| _|j| _t | _t | _|jdkr|jdkrgttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _ntd	|j d
|jdkr|jdkrttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _njtd	|j d
|jdv r$|jdkrttjd|jdd| jfd| _ttjd|jdd| jfd| _n3|jdkrttjd|jdd| _ttjd|jdd| _ntd	|j d
t d|j dt!| dr9t!| ds=tdd S )NT)r   r   r   r   r   r   r   r   r   zerosZvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr   )r   stdr   zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"rd   re   r   r   r   r   r   
cross_attnr   r   r   r   r   r   r   r   r   r
   ZTanhact_cross_attn	act_densealpha_initializerZ
alpha_typer   r.   r   r  r  rc   r   r   alphas_initializer_rangeNotImplementedErrorr   r   rk   r1   r2   re      sn   
	








z(IdeficsGatedCrossAttentionLayer.__init__Fr'   r<   r)   r:   cross_attention_gater   r   r   r^   c	                 K   s  |du rt d|du rt d|durtd|}
| |}| jd	||||d|	\}}}tjj|| j| jd}|	|dkdddddf d}|
| 
| j|  }|}
| |}| |}tjj|| j| jd}|
| | j|  }|f}|r||f7 }|r||f7 }|S )
a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            cross_attention_gate (`torch.FloatTensor`, *optional*):
                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r'   r   r<   r   r   r   r   r1   )rc   r	  r   r  r
   r   r   r   r   masked_fillr  r  r   r   r  r  )ri   r'   r<   r)   r:   r
  r   r   r   rj   r   r   r   r   r1   r1   r2   rp   B  sD   
"



z'IdeficsGatedCrossAttentionLayer.forwardrQ   r   )r*   r+   r,   r   r   r   re   r.   r   rt   r   r/   rp   rv   r1   r1   rk   r2   r     s8    E	r   c                   @   s<   e Zd ZeZdZdZddgZdZdZ	dZ
dZdZdd ZdS )	IdeficsPreTrainedModelrY   Tr   r   Fc                 C   s  | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|tjr[|jjd |j	j
  d S t|tri|jjd d S t|trv|jj  d S t|tr| j jdkr|jj
  |jj
  d S | j jdkr|jjd |jjd d S | j jdv r|jjjd| j jd |jjjd| j jd d S d S t|tr|jj  d S d S )Nr   )r   r  r   r   r   >   r   r   r   )r   Zinitializer_rangerR   r
   rJ   ZConv2drg   dataZnormal_r{   Zzero_rK   rb   rI   Zfill_r   r    Zclass_embeddingr   r  r  r  r  r   Zlatents)ri   rT   r  r1   r1   r2   _init_weights  s@   





z$IdeficsPreTrainedModel._init_weightsN)r*   r+   r,   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_sdpaZ_supports_cache_classZ_supports_flash_attn_2Z_supports_static_cacheZ_supports_attention_backendr  r1   r1   r1   r2   r    s    r  c                   @   s   e Zd ZdS )KwargsForCausalLMN)r*   r+   r,   r1   r1   r1   r2   r    s    r  c                '       s  e Zd ZdZdef fddZd.ddZg fdd	Zg fd
dZdd Z	dd Z
ee															d/deej deej deej deeej  deej deej deej deej deej dee dee dee dee dee deej d ee d!eeef f"d"d#Z	d0deejd$f d%ejdejdedef
d&d'Zedejd(ed)ed*ejdejd+efd,d-Z  Z S )1IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c                    s
  t     | _ j| _ j| _t j j j j	| jd| _
 jj| _ j| _t j| _ jrE j}t  jj|j|j|j|j| _t fddt jD | _ j| _ j| j }t fddt|D | _d| _t  j j!d| _"| #  | $  d S )N)r_   rf   r`   r]   rb   c                       g | ]}t  |d qS )r   )r   rL   ir   r1   r2   rO         z)IdeficsModel.__init__.<locals>.<listcomp>c                    r  r  )r   r  r  r1   r2   rO     r  Fr   )%rd   re   r   Zpad_token_idrb   
vocab_sizer\   additional_vocab_sizer   freeze_text_layersembed_tokensr   Z
image_sizer!   vision_modeluse_resamplerperceiver_configr   r   Zresampler_depthZresampler_n_headsZresampler_head_dimZresampler_n_latentsperceiver_resamplerr
   Z
ModuleListrangeZnum_hidden_layerslayerscross_layer_intervalgated_cross_attn_layersgradient_checkpointingr   r   norm	post_initfreeze_relevant_params)ri   r   r  Znum_cross_layersrk   r  r2   re     sH   
	zIdeficsModel.__init__Nc                 C   s>   |d u r| j }|jr| |j |jrt| j|jd d S d S N)rZ   )r   r  Zfreeze_text_module_exceptionsfreeze_vision_layersr[   r  Zfreeze_vision_module_exceptions)ri   r   r1   r1   r2   r&    s   z#IdeficsModel.freeze_relevant_paramsc                 C   s"   | j | jfD ]}t||d qd S r'  )r   r$  r[   )ri   rZ   rT   r1   r1   r2   r    s   zIdeficsModel.freeze_text_layersc                 C   s   t | j|d d S r'  )r[   r  )ri   rZ   r1   r1   r2   r(    s   z!IdeficsModel.freeze_vision_layersc                 C      | j S rQ   r  rr   r1   r1   r2   get_input_embeddings     z!IdeficsModel.get_input_embeddingsc                 C   
   || _ d S rQ   r*  ri   r   r1   r1   r2   set_input_embeddings     
z!IdeficsModel.set_input_embeddingsFrE   r<   r   r&   inputs_embedsr7   r8   r9   r:   r   r   output_hidden_statesinterpolate_pos_encodingreturn_dictr   rj   r^   c           *         s  |dur|j n|j }|dur|n| jj}|dur|n| jj}|
dur$|
n| jj}
|dur.|n| jj}|du |duA r>td| jrM| jrM|
rMt	
d d}
|du rV| |}d}|
rst|tssd}|du rit }n
t|}t	
d |j\}}}|dur| nd}|| }|du rtj|||jd  |j d	}|dur|du r| d
d }||dkd |dd| df }n	|du r|d}tdd |||fD dkrtd|dur
|j| j|d}|jdd \}}| j|| g|jdd R  }| j||dj}n|dur(| \}}}}|j| j|d}||| ||}| jj rN|du rC| !|}|d|d}}n| \}}}}|}n|du r_|d|d}}ntd|||| |}|	d}|	d
}	|	"ddd|}	|	|||| }	|dur| \}}}||f}|	du rtj#||d	}	| $|	}	nd}	|	dkj%d
dj| jdj&dd|} |du rtj#||ftj'|j d}| (|||||}|}!|rdnd}"|rdnd}#d}$t)| j*D ]o\}%}&|r|"|!f7 }" fdd}'| jr,| jr,d}|
rt	
d d}
| +|'|&|!|||||	| ||
|%| j,| j-|}(n|'|&|!f|||||	| ||
|%| j,| j-|d }(|(d }!|
rT|(|rQdnd }$|r^|#|(d f7 }#q| .|!}!|rm|"|!f7 }"|
rr|$nd})|r{|)/ })|||||}t0|!|)|"|#|dS )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   rB   r6   c                 S   s   g | ]}|d u qS rQ   r1   )rL   r   r1   r1   r2   rO   c  rP   z(IdeficsModel.forward.<locals>.<listcomp>r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)ra   rB   )r7   r3  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer   r   r   r1   c              
      sb   |
| dkr ||
|  }||f||||||	d d }|d }| |f|||||	|d }|S )Nr   )r<   r)   r:   r
  r   r   r   )r<   r   r   r   r   r   r1   )Z
main_blockr'   r<   r   r   r)   r:   r
  r   r   r   r!  r"  r   Zxblockr   layer_outputsrj   r1   r2   vblock  s:   	z$IdeficsModel.forward.<locals>.vblockzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)r<   r   r   r)   r:   r
  r   r   r   r!  r"  r   )r%   r&   r'   r(   r)   )1rB   r   r   r2  r   use_return_dictrc   r#  r   r   r   r  rR   r   r   Zfrom_legacy_cacher>   get_seq_lengthr.   r=   longZcumsumZmasked_fill_r   sumrA   ra   r   r?   r  r%   r   r  r  r@   r   Zinvert_attention_maskrW   Zsqueezert   _update_causal_mask	enumerater   Z_gradient_checkpointing_funcr!  r"  r$  Zto_legacy_cacher$   )*ri   rE   r<   r   r&   r1  r7   r8   r9   r:   r   r   r2  r3  r4  r   rj   rB   Zreturn_legacy_cache
batch_size
seq_lengthr   past_key_values_lengthZseq_length_with_pastZ
num_imagesr)   Zimage_seq_lenZimage_hidden_sizeZtext_seq_lenZimage_batch_sizeZimage_sequence_lengthZimage_hidden_shaper
  r'   Zall_hidden_statesZall_self_attnsZnext_decoder_cacheidxZdecoder_layerr8  r6  Z
next_cacher1   r7  r2   rp     s8  



$








"


-

zIdeficsModel.forwardr"   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2r   Zflex_attentionr   Fr   )r1  rA  Zis_trainingr   r6   )sequence_lengthtarget_lengthra   r   r?  )cudaZxpuZnpu)r   r   rW   rR   r.   r   r#   r:  Zis_compileabler   Z_ignore_causal_mask_sdpar   ra   r>   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrB   typefinfominZ_unmask_unattended)ri   r<   rC  r   r&   r   Zpast_seen_tokensZusing_compilable_cachera   rD  rE  causal_mask	min_dtyper1   r1   r2   r=  (  sT   




z IdeficsModel._update_causal_maskrD  rE  ra   r?  c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )Z
fill_valuera   rB   r   )Zdiagonalr5  r6   r   )r   r.   rI  rJ  fullrB   Ztriur=   r   expandrn   r>   rA   r  )r<   rD  rE  ra   r   r?  rj   rK  rL  Zmask_lengthZpadding_maskr1   r1   r2   rG  l  s,    $
6  zBIdeficsModel._prepare_4d_causal_attention_mask_with_cache_positionrQ   )NNNNNNNNNNNNFNNF)!r*   r+   r,   r-   r   re   r&  r  r(  r+  r/  r   r   r   r.   r   r   r   r/   rt   r   r   r	   r   r$   rp   r   r=  staticmethodr   ra   rG  rv   r1   r1   rk   r2   r    s    
0
	

  
Dr  c                )       s  e Zd ZddgZd2 fdd	Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zee																d3deej deej deej deeej  deej deej deej deej deej deej dee d ee d!ee d"ee d#ee d$eej d%ee d&eeef f$d'd(Z									d4 fd)d*	Z	d5d+ed,eeef d-ed&eeef f fd.d/Z e!d0d1 Z"  Z#S )6IdeficsForVisionText2Textzmodel.embed_tokens.weightzlm_head.weightNc                    s>   t  | t|| _t|j|j|jd|jd| _	| 
  d S )NF)rx   ry   rz   r{   r]   )rd   re   r  rY   rw   r   r  r  Zfreeze_lm_headlm_headr%  )ri   r   r  rk   r1   r2   re     s   
	z"IdeficsForVisionText2Text.__init__c                 C   s   | j jS rQ   rY   r  rr   r1   r1   r2   r+    s   z.IdeficsForVisionText2Text.get_input_embeddingsc                 C   s   || j _d S rQ   rT  r.  r1   r1   r2   r/    s   z.IdeficsForVisionText2Text.set_input_embeddingsc                 C   r)  rQ   rS  rr   r1   r1   r2   get_output_embeddings  r,  z/IdeficsForVisionText2Text.get_output_embeddingsc                 C   r-  rQ   rU  )ri   Znew_embeddingsr1   r1   r2   set_output_embeddings  r0  z/IdeficsForVisionText2Text.set_output_embeddingsc                 C   r-  rQ   rY   )ri   decoderr1   r1   r2   set_decoder  r0  z%IdeficsForVisionText2Text.set_decoderc                 C   r)  rQ   rX  rr   r1   r1   r2   get_decoder  r,  z%IdeficsForVisionText2Text.get_decoderc                 C   s   |   }|  }t| jddr&|j|_|jdkr&|j|jks J |jj|j_t	|drDt	|drF|j
|_t	|drHt	|drJ|j|_dS dS dS dS dS )	z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        Ztie_word_embeddingsTr   ry   r_   rz   rf   N)rV  r+  getattrr   rg   rf   rz   rh   r|   r   r_   ry   )ri   Zoutput_embeddingsZinput_embeddingsr1   r1   r2   tie_weights  s   
z%IdeficsForVisionText2Text.tie_weightsFrE   r<   r   r&   r1  r7   r8   r9   r:   labelsr   r   r2  r3  r4  r   rj   r^   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd|||||||||	||||d|d|}|d }| |}d}|
durS| jd||
| j jd|}t|||j	|j
|j|jdS )aC  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```NT)rE   r<   r   r&   r1  r7   r8   r9   r:   r   r   r2  r3  r4  r   r   )r5   r^  r  )r4   r5   r&   r'   r(   r)   r1   )r   r   r2  r9  rY   rS  Zloss_functionr  r3   r&   r'   r(   r)   )ri   rE   r<   r   r&   r1  r7   r8   r9   r:   r^  r   r   r2  r3  r4  r   rj   r   r'   r5   r4   r1   r1   r2   rp     sH   <
z!IdeficsForVisionText2Text.forwardc              
      s   i }|d ur| j jr||d< n	||d< n||d< |dd|d< t j|f||||||
|	d||}|	d urP|d u rP|d jd }|	d d | d f |d	< |S )
Nr9   r8   r7   r3  F)r&   r<   r1  r   r   r   r:   rE   r   r:   )r   r  poprd   prepare_inputs_for_generationr>   )ri   rE   r<   r   r1  r&   r   r7   r)   r:   r   rj   Zimages_kwargsZmodel_inputsr@  rk   r1   r2   r`  D  s4   

	
z7IdeficsForVisionText2Text.prepare_inputs_for_generationr   rG   rF   c                    s~   t  j|||fi |}d|v r8|d }|d d dd d f d}|ddr-||d< ntj||gdd|d< |j|d< |S )Nr:   r6   r   r   Tr   r)   )rd   #_update_model_kwargs_for_generationr   rD   r.   r~   r)   )ri   r   rG   rF   rj   r:   Z	last_maskrk   r1   r2   ra  q  s   

z=IdeficsForVisionText2Text._update_model_kwargs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr1   c                 3   s    | ]	}| d  V  qdS )r   N)rC   )rL   Z
past_statebeam_idxr1   r2   rU     s    z;IdeficsForVisionText2Text._reorder_cache.<locals>.<genexpr>)r   )Zpastrc  Zreordered_pastZ
layer_pastr1   rb  r2   _reorder_cache  s   z(IdeficsForVisionText2Text._reorder_cacherQ   )NNNNNNNNNNNNNFNN)	NNNNNNNNNrP  )$r*   r+   r,   Z_tied_weights_keysre   r+  r/  rV  rW  rZ  r[  r]  r   r   r   r.   r   r   r   r/   rt   r   r  r	   r   r3   rp   r`  r   r   ru   r   ra  rQ  rd  rv   r1   r1   rk   r2   rR    s    	

g1

rR  )rR  r  r  )r   FNN)r   )r   )Tr-   dataclassesr   typingr   r   r   r   r   r   r	   r.   Ztorch.nn.functionalr
   r   rm   Ztorch.utils.checkpointZactivationsr   Zcache_utilsr   r   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_flash_attention_utilsr   Zmodeling_outputsr   Zmodeling_utilsr   r   r   Zprocessing_utilsr   Zpytorch_utilsr   utilsr   r   r   r   r   Zconfiguration_ideficsr   Z	perceiverr   Zvisionr    r!   Z!torch.nn.attention.flex_attentionr"   Zintegrations.flex_attentionr#   Z
get_loggerr*   r   r$   r3   rH   r[   rK   r\   rJ   rw   Moduler   appendr   r   r   r   r   r   r   r   r   r   r  r  r  rR  __all__r1   r1   r1   r2   <module>   s   $
,+
-nB
'

 &Q -   f o