
    fThN(                    z   S r SSKJr  SSKJrJrJrJrJrJ	r	J
r
  SSKrSSKJs  Jr  SSKrSSKJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJ r J!r!  SSK"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-  SSK.J/r/  SSK0J1r1J2r2  \*" 5       (       a  SSK3J4r4  SSK5J6r6  \+Rn                  " \85      r9\ " S S\5      5       r:\ " S S\5      5       r;    S@S jr</ 4S jr= " S S\R|                  5      r? " S S \R                  5      rA " S! S"\R                  5      rC\%R                  " \C5         " S# S$\R                  R                  5      rES% rFSAS& jrG " S' S(\R                  5      rH SBS)\R                  S*\R                  S+\R                  S,\R                  S-\\R                     S.\JS/\J4S0 jjrK " S1 S2\R                  5      rL " S3 S4\R                  5      rM " S5 S6\R                  5      rN\( " S7 S8\!5      5       rO " S9 S:\\'5      rP\( " S; S<\O5      5       rQ " S= S>\O\5      rR/ S?QrSg)CzPyTorch Idefics model.    )	dataclass)AnyCallableDictListOptionalTupleUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)ALL_LAYERNORM_LAYERS)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformer)	BlockMask)make_flex_block_causal_maskc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\\R                           \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
IdeficsBaseModelOutputWithPast6   av	  
Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r(   r   torchFloatTensor__annotations__r)   r	   r*   r+   r,   __static_attributes__r-       d/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/idefics/modeling_idefics.pyr&   r&   6   s    "H 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;Br7   r&   c                   "   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   S
rg)IdeficsCausalLMOutputWithPastc   ap  
Base class for Idefics causal language model (or autoregressive) outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
Nlosslogitsr)   r*   r+   r,   r-   )r.   r/   r0   r1   r2   r<   r   r3   r4   r5   r=   r)   r   r*   r	   r+   r,   r6   r-   r7   r8   r:   r:   c   s    @ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;Br7   r:   c                    [         R                  " U R                  S   5      R                  SS5      R	                  SU5      R                  S5      R                  U R                  5      nU R                  SU5      n UR                  SS 5      US'   UR                  SS 5      US'   UR                  SS 5      US'   UR                  SS 5      US'   SU;   a  US   nUR                  SU5      US'   Ub  UR                  SU5      US	'   US   b  US   R                  SU5      US'   US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ US   b  US   R                  SU5      US'   X4$ )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r3   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderrE   encoder_outputsmodel_kwargsexpanded_return_idxrD   s           r8   expand_inputs_for_generationrT      s    	Y__Q'(--b!4;;A{KPPQSTWWXaXhXhi  &&q*=>I#/#3#3ND#IL /;/?/?@Z\`/aL+,+7+;+;<RTX+YL'(+7+;+;<RTX+YL'(<'%&67)7)D)DQH[)\%&!)7)D)DQH[)\%&*+7/;<R/S/`/`"0
+, N#/'3N'C'P'PQRTg'h^$ "" 
0	1	=3?@Z3[3h3h"4
/0 "" 
,	-	9/;<R/S/`/`"0
+, ""r7   c                 R  ^ [         R                  [         R                  [         R                  S.nU Vs/ s H  o2U   PM	     nnU R	                  5        HH  mU(       a-  [        U4S jU 5       5      (       a  TR                  S5        M7  TR                  S5        MJ     U $ s  snf )N)	LayerNormLinear	Embeddingc              3   <   >#    U  H  n[        TU5      v   M     g 7fN)
isinstance).0tmodules     r8   	<genexpr>freeze_model.<locals>.<genexpr>   s     $]D\qZ%:%:D\s   TF)r   rV   rW   rX   modulesanyrequires_grad_)modelmodule_exceptionsmappingmmodule_exceptions_mappedr^   s        @r8   freeze_modelri      s    \\))\\G
 5FF4Eq
4EF--/$]D\$]!]!]!!$'!!%(	 "
 L  Gs   B$c                   ^   ^  \ rS rSrSr    S
S\\   SS4U 4S jjjrS rS\	4S jr
S	rU =r$ )IdeficsDecoupledEmbedding   a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
then it will create `num_additional_embeddings` additional parameters that are always trained. If
`num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
Npartially_freezereturnc           	      F  > Ub  Xq:  a  [        SU SU 35      e[        T	U ]  " SUUUUUS.UD6  Xl        Xpl        X l        X@l        U(       a  U R                  R                  S5        U R
                  S:  a'  [        R                  " U R
                  UUUS9U l        gg)	a  
Args:
    num_embeddings (`int`):
        Size of the dictionary of embeddings
    num_additional_embeddings (`int`):
        Number of additional embeddings. Only useful when you `partially_freeze=True`.
    embedding_dim (`int`):
        The size of each embedding vector
    partially_freeze: (`bool`, *optional*, defaults to `False`):
        If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
    padding_idx (`int`, *optional*):
        The padding index (needs to be less than num_embeddings)

Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
`max_norm` or `norm_type`. We are not supporting these.
Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrK   dtypepadding_idxFr   )rp   rq   rK   rr   r-   )
ValueErrorsuper__init__rp   rs   num_additional_embeddingsrm   weightrc   r   rX   additional_embedding)
selfrp   rw   rq   rm   rK   rr   rs   kwargs	__class__s
            r8   rv   "IdeficsDecoupledEmbedding.__init__   s    6 "{'CN{m[`ao`pqrr 	
)'#	
 	
 -&)B& 0KK&&u-))A-(*#==+	)D% .r7   c                 \   U R                   S:X  a   [        R                  " XR                  5      $ UR	                  5       n[
        R                  " XR                  :  5      nX   nU R                  X0R                  -
  5      nSX'   [        R                  " XR                  5      nXEU'   U$ )a{  
we have 2 embeddings, with different indices - one pretrained self.weight and another
self.additional_embedding.weight that is being trained.

in order to make a lookup of the input ids, we:
1. find out the indices of the entries belonging to the 2nd embedding
2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
   embedding starts from 0 and not num_embeddings
3. perform the 2nd embedding lookup
4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
5. perform the 1st embedding lookup
6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
measure.

r   )	rw   F	embeddingrx   cloner3   whererp   ry   )rz   rN   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectors         r8   forward!IdeficsDecoupledEmbedding.forward  s    * ))Q.;;y++66 OO%	#(;;y<O<O/O#P %.%H" $ 9 9:TWjWj:j k /0	+kk)[[9 1F,-r7   c                 z    SR                  U R                  U R                  U R                  U R                  5      $ )NzVnum_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={})formatrp   rw   rq   rm   rz   s    r8   
extra_repr$IdeficsDecoupledEmbedding.extra_repr.  s9    gnn**!!	
 	
r7   )ry   rw   rp   rs   rm   )FNNN)r.   r/   r0   r1   r2   r   boolrv   r   strr   r6   __classcell__r|   s   @r8   rk   rk      sP     ,13
 #4.3 
3 3j%N
C 
 
r7   rk   c                      ^  \ rS rSrSr     SS\S\S\S\S\S	S4U 4S
 jjjrS\R                  S	\R                  4S jr
S	\4S jrSrU =r$ )IdeficsDecoupledLineari7  a  
Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
then it will create `out_additional_features * in_features` additional parameters that are always trained. If
`out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
Nin_featuresout_featuresout_additional_featuresbiasrm   rn   c                 (  > [         TU ]  XXFU5        X0l        XPl        Xl        X l        U(       a=  U R                  R                  S5        U(       a  U R                  R                  S5        US:  a  [        R                  " UUUUUS9U l        gg)a'  
out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
`partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
Fr   )r   r   r   rK   rr   N)ru   rv   r   rm   r   r   rx   rc   r   r   rW   additional_fc)	rz   r   r   r   r   rm   rK   rr   r|   s	           r8   rv   IdeficsDecoupledLinear.__init__@  s     	D%H'>$ 0&(KK&&u-		((/"Q&!#'4"D 'r7   inputc                     [         R                  " XR                  U R                  5      nU R                  S:  a)  U R                  U5      n[        R                  " X#4S5      nU$ )Nr   r?   )r   linearrx   r   r   r   r3   cat)rz   r   outputadditional_featuress       r8   r   IdeficsDecoupledLinear.forwardd  sQ    %dii8''!+"&"4"4U";YY<bAFr7   c                     SR                  U R                  U R                  U R                  U R                  SLU R
                  5      $ )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zYin_features={}, out_features={}, out_additional_features={}, bias={}, partially_freeze={}N)r   r   r   r   r   rm   r   s    r8   r   !IdeficsDecoupledLinear.extra_reprm  sE    jqq((IIT!!!
 	
r7   )r   r   r   r   rm   )r   TTNN)r.   r/   r0   r1   r2   intr   rv   r3   Tensorr   r   r   r6   r   r   s   @r8   r   r   7  s     ()!%"" " "%	"
 " " 
" "HU\\ ell 
C 
 
r7   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )IdeficsRMSNormiy  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
IdeficsRMSNorm is equivalent to T5LayerNorm
N)ru   rv   r   	Parameterr3   onesrx   variance_epsilon)rz   hidden_sizeepsr|   s      r8   rv   IdeficsRMSNorm.__init__z  s/     	ll5::k#:; #r7   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   r?   T)keepdim)rJ   r3   float32powmeanrsqrtr   rx   rr   float16bfloat16)rz   r*   variances      r8   r   IdeficsRMSNorm.forward  s     ##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r7   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tuplerx   rG   r   r   s    r8   r   IdeficsRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr7   )r   rx   )gư>)	r.   r/   r0   r1   rv   r   r   r6   r   r   s   @r8   r   r   y  s    $+J Jr7   r   c                   <   ^  \ rS rSrSU 4S jjrS rSS jrSrU =r$ )IdeficsEmbeddingi  c           	        > [         TU ]  5         Xl        X l        X0l        SU R                  [
        R                  " SU R                  S[
        R                  S9R                  U[
        R                  S9U R                  -  -  -  nU R                  SUSS9  U R                  X R                  R                  [
        R                  " 5       S	9  g )
N      ?r   r   rr   rK   rr   inv_freqF
persistentseq_lenrK   rr   )ru   rv   dimmax_position_embeddingsbaser3   rF   int64rJ   floatregister_buffer_set_cos_sin_cacher   rK   get_default_dtype)rz   r   r   r   rK   r   r|   s         r8   rv   IdeficsEmbedding.__init__  s    '>$	IIQ!5;;?BB&X]XcXcBdgkgogooq
 	ZeD 	+MM4H4HPUPgPgPi 	  	
r7   c                    Xl         [        R                  " U R                   U[        R                  S9R	                  U R
                  5      n[        R                  " SX@R
                  5      n[        R                  " XU4SS9nU R                  SUR                  5       R                  U5      SS9  U R                  SUR                  5       R                  U5      SS9  g )	Nr   zi,j->ijr?   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr3   rF   r   type_asr   einsumr   r   cosrJ   sin)rz   r   rK   rr   r]   freqsembs          r8   r   #IdeficsEmbedding._set_cos_sin_cache  s    ")LL00u{{S[[\`\i\ijY==9iiB/\3779<<+>5Q\3779<<+>5Qr7   c                     X R                   :  a$  U R                  X!R                  UR                  S9  U R                  S U R                  UR                  S9U R                  S U R                  UR                  S94$ )Nr   r   )r   r   rK   rr   r   rJ   r   )rz   xr   s      r8   r   IdeficsEmbedding.forward  su    ,,,##GHHAGG#T OOHW%((qww(7OOHW%((qww(7
 	
r7   )r   r   r   r   )i   i'  NrZ   )	r.   r/   r0   r1   rv   r   r   r6   r   r   s   @r8   r   r     s    
"R
 
r7   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr?   r   r   )rG   r3   r   )r   x1x2s      r8   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                     X$   R                  U5      nX4   R                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a&  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`):
        The position indices of the tokens corresponding to the query and key tensors. For example, this can be
        used to pass offsetted position ids when working with a KV-cache.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embeds           r8   apply_rotary_pos_embr     s]    * 

%
%m
4C


%
%m
4Cw;q>C/0Gw;q>C/0Gr7   c                   >   ^  \ rS rSrS\S\S\4U 4S jjrS rSrU =r	$ )
IdeficsMLPi  r   intermediate_size
hidden_actc                    > [         TU ]  5         [        R                  " XSS9U l        [        R                  " X!SS9U l        [        R                  " XSS9U l        [        U   U l        g )NFr   )	ru   rv   r   rW   	gate_proj	down_projup_projr   act_fn)rz   r   r   r   r|   s       r8   rv   IdeficsMLP.__init__  sS     	;N#4NyyeLZ(r7   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      $ rZ   )r   r   r   r   )rz   r   s     r8   r   IdeficsMLP.forward  s0    ~~dkk$..*;<t||ANOOr7   )r   r   r   r   )
r.   r/   r0   r1   r   r   rv   r   r6   r   r   s   @r8   r   r     s0    
)
) 
) 	
)P Pr7   r   r^   querykeyvaluerE   scalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr?   )r   rr   ptrainingr   r   )r3   matmul	transposer   
functionalsoftmaxr   rJ   rr   r   r  
contiguous)
r^   r   r   r   rE   r   r   r{   attn_weightsattn_outputs
             r8   eager_attention_forwardr    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r7   c                     ^  \ rS rSrSr     SS\S\S\S\S\S\S	\	\   4U 4S
 jjjr
S\R                  S\S\4S jr       SS\R                  S\	\R                     S\	\R                     S\	\R                     S\	\\R                        S\S\S\	\R                     S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )IdeficsAttentioni	  z=Multi-headed attention from 'Attention Is All You Need' paperr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc                   > [         T	U ]  5         XPl        Xl        X l        X-  U l        X0l        SU l        U R
                  S-  U l        Xpl	        Uc-  [        R                  SU R                  R                   S35        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eX@l        [!        ["        R$                  S5      (       d  [        S	5      eU R                  (       a  [!        UR&                  S
5      (       d  U R                  OUR&                  R(                  n["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " XU R
                  -  SS9U l        ["        R*                  " UX R
                  -  SS9U l        O["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " U R                  X R
                  -  SS9U l        ["        R*                  " X R
                  -  USS9U l        [5        U R
                  5      U l        X`l        U R8                  (       aG  [;        U R
                  UR<                  S9U l        [;        U R
                  UR<                  S9U l         g g )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!ru   rv   r  r   r  head_dimr   	is_causalr   r  loggerwarning_oncer|   r.   rt   r  hasattrr   r  vision_configr  rW   q_projk_projv_projo_projr   
rotary_embr  r   rms_norm_epsq_layer_normk_layer_norm)
rz   r   r  r   r  r  r  r  kv_input_dimr|   s
            r8   rv   IdeficsAttention.__init__  st    	&"#0}}d*" !8!8 9 :, , MMI%$*:*::QRVRbRbQc$YKr3 
 #5r}}&DEEHII""(/0D0Dk(R(R  X^XlXlXvXv  ))  MM)DK
 ))Ldmm2KRWXDK))MM)DK ))  MM)DK
 ))  MM)DK
 ))  MM)DK
 ii%

 +4==9, .t}}&BUBU VD .t}}&BUBU VD r7   tensorr   bszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr   r   )rH   r  r  r  r  )rz   r'  r   r(  s       r8   _shapeIdeficsAttention._shape]  s5    {{3GQQRSUVWbbddr7   r*   key_value_statesrE   r   past_key_valueoutput_attentions	use_cachecache_positionrn   c	                 z   U R                   =(       d    US Ln
UR                  5       u  pnU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU
(       d  U R                  U5      R                  XU R                  U R
                  5      R                  SS5      nU R                  U5      R                  XU R                  U R
                  5      R                  SS5      nOUR                  5       u  nnnU R                  U5      R                  UUU R                  U R
                  5      R                  SS5      nU R                  U5      R                  UUU R                  U R
                  5      R                  SS5      nUR                  S   nUb  UUS   -  nU
(       d-  U R                  U[        UU5      S9u  nn[        XUUU5      u  pUb%  SU0nUR                  UUU R                  U5      u  nnU R                  (       a"  U R!                  U5      nU R#                  U5      n[$        nU R&                  R(                  S:w  aT  U R&                  R(                  S:X  a  U(       a  [*        R-                  S	5        O[.        U R&                  R(                     nU" U UUUU4U R0                  (       d  S
OU R2                  U R4                  S.U	D6u  nnUR7                  XS5      R9                  5       nU R;                  U5      nU(       a  S nUUU4$ )Nr   r   r   r   )r   r0  eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r?   )r  sizer  rH   r  r  r  r  r  rG   r!  maxr   updater  r  r#  r$  r  r  _attn_implementationr  r  r   r  r   r   reshaper  r   )rz   r*   r,  rE   r   r-  r.  r/  r0  r{   r  r(  q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer
  r	  s                            r8   r   IdeficsAttention.forward`  s    "44T8HPT8T%**,A{{=166s4>>SWS`S`akklmopq!]388T^^UYUbUbcmmnoqrsJ;;}5::3t~~W[WdWdeoopqstuL+002LAvq%56;;CY]YfYfgqqrsuvwJ,-223PTP]P]^hhijlmn   %%b)
%.++J!|SU=STHC';LVY[^`l'm$L %,n=L'5'<'<ZW[WeWegs't$J,,\:L**:6J(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7	%
  $}}C$,,LL	%
 	%
!\ "))#b9DDFkk+.LL.88r7   )r  r   r  r   r  r  r$  r  r  r  r   r#  r  r  r!  r   r  )r4  FNFNNNNNFFN)r.   r/   r0   r1   r2   r   r   r   r   r   rv   r3   r   r*  
LongTensorr	   r   r6   r   r   s   @r8   r  r  	  s   G #(#'$#'OWOW OW 	OW
 !OW !OW OW C=OW OWbeU\\ eC ec e 4815378<"'59J9||J9 #5<<0J9 !.	J9
 u//0J9 !u||!45J9  J9 J9 !!1!12J9 
u||Xell3XeELL>Q5RR	SJ9 J9r7   r  c                   \  ^  \ rS rSrSS\S\\   4U 4S jjjr      SS\R                  S\\R                     S\\R                     S\\\R                        S	\\   S
\\   S\\R                     S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )IdeficsDecoderLayeri  r  r  c                   > [         TU ]  5         UR                  U l        [        U R                  UR                  UR
                  UUS9U l        [        U R                  UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        g )N)r   r  r   r  r  r   r   r   r  )ru   rv   r   r  num_attention_headsr   	self_attnr   r   r   mlpr   r"  input_layernormpost_attention_layernormrz   r  r  r|   s      r8   rv   IdeficsDecoderLayer.__init__  s    !--)((00NN
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%~~r7   r*   rE   r   r-  r.  r/  r0  rn   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pn[        R                  R	                  XR                  U R
                  S9nX-   nUn	U R                  U5      nU R                  U5      n[        R                  R	                  XR                  U R
                  S9nX-   nU4nU(       a  X4-  nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
)r*   rE   r   r-  r.  r/  r0  r  r-   )rM  rK  r   r  r   r  rN  rL  )rz   r*   rE   r   r-  r.  r/  r0  r{   residualself_attn_weightspresent_key_valueoutputss                r8   r   IdeficsDecoderLayer.forward  s    2 !,,]; ?Cnn 	?
')%)/)	?
 	?
;*; --m||VZVcVc-d 0 !55mD/--m||VZVcVc-d 0 "++G++Gr7   )r   r   rM  rL  rN  rK  rZ   )NNNFFN)r.   r/   r0   r1   r   r   r   rv   r3   r   rE  r	   r   r4   r   r6   r   r   s   @r8   rG  rG    s    &} &# & &, 26378<,1$)59:||: !.: u//0	:
 !u||!45: $D>: D>: !!1!12: 
u  (51B1BEDUDU1U+V"WW	X: :r7   rG  c                   |  ^  \ rS rSrSS\S\\   4U 4S jjjr       SS\R                  S\\R                     S\\R                     S\\R                     S	\\R                     S
\\
   S\\
   S\\\R                        S\\R                  \\\R                  \R                  4      4   4S jjrSrU =r$ )IdeficsGatedCrossAttentionLayeri  r  r  c           
      	  > [         TU ]  5         UR                  U l        [        U R                  UR                  SUR
                  UUR                  US9U l        [        U R                  UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        UR
                  U l        ["        R$                  " 5       U l        ["        R$                  " 5       U l        UR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R2                  " SSU R                  5      5      U l        ["        R.                  " [0        R2                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R2                  " S5      5      U l        ["        R.                  " [0        R2                  " S5      5      U l        GO[9        S	UR,                   S
35      eUR*                  S:X  Ga  UR,                  S:X  ax  ["        R.                  " [0        R:                  " SSU R                  5      5      U l        ["        R.                  " [0        R:                  " SSU R                  5      5      U l        GOUR,                  S:X  a`  ["        R.                  " [0        R:                  " S5      5      U l        ["        R.                  " [0        R:                  " S5      5      U l        GO|[9        S	UR,                   S
35      eUR*                  S;   Ga9  UR,                  S:X  a  ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        ["        R.                  " [0        R<                  " SUR>                  SSU R                  4S95      U l        OUR,                  S:X  as  ["        R.                  " [0        R<                  " SUR>                  SS95      U l        ["        R.                  " [0        R<                  " SUR>                  SS95      U l        O2[9        S	UR,                   S
35      e[A        SUR*                   S35      e[C        U S5      (       a  [C        U S5      (       d  [9        S5      eg )NT)r   r  r  r   r  r  r  rI  r  zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr4  )r   stdr5  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"ru   rv   r   r  rJ  r   r  
cross_attnr   r   r   rL  r   r"  rM  rN  r  r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r3   rZ  ra  rb  rt   r   r]  alphas_initializer_rangeNotImplementedErrorr  rO  s      r8   rv   (IdeficsGatedCrossAttentionLayer.__init__   s   !--*((00#NN!00
 (($66((

  .f.@.@fFYFYZ(6v7I7IvObOb(c%nn ggi##w.  H,(*U[[AtGWGW5X(Y%#%<<Aq$BRBR0S#T ""g-(*U[[^(D%#%<<A#?  #CFDUDUCVVW!XYY%%/  H,(*UZZ1dFVFV5W(X%#%<<

1aAQAQ0R#S ""g-(*UZZ](C%#%<<

1#>  #CFDUDUCVVW!XYY%%)II  H,(*LLcv/N/NVWYZ\`\l\lUmn)% $&<<LLcv/N/NVWYZ\`\l\lUmn$  ""g-(*LLcv/N/NVWY)% $&<<#6KjKjrs0u#v  #CFDUDUCVVW!XYY &(DVE]E]D^^s&tuu011gdM6R6RJKK 7Sr7   r*   rE   r,   rC   cross_attention_gater.  r/  r-  rn   c	                    Uc  [        S5      eUc  [        S5      eUb  [        S5      eUn
U R                  U5      nU R                  " S	UUUUS.U	D6u  pn[        R
                  R                  XR                  U R                  S9nUR                  US:H  SS2SS2S4   S5      nXR                  U R                  5      U-  -   nUn
U R                  U5      nU R                  U5      n[        R
                  R                  XR                  U R                  S9nXR                  U R                  5      U-  -   nU4nU(       a  X4-  nU(       a  X4-  nU$ )
am  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    cross_attention_gate (`torch.FloatTensor`, *optional*):
        gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r*   r,  rE   r.  r  r   r4  r-   )rt   rj  rM  rc  r   r  r   r  r  masked_fillre  ra  rN  rL  rf  rb  )rz   r*   rE   r,   rC   rl  r.  r/  r-  r{   rR  rS  rT  rU  s                 r8   r   'IdeficsGatedCrossAttentionLayer.forwardB  s   : &# 
  ' ^  %%&uvv ,,]; ?Coo ?
'0//	?

 ?
;*; --m{{UYUbUb-c%113G13LaQRTXj2Y[^_ #6#6t7L7L#MP]#]] !55mD/--m{{UYUbUb-c >>$2B2B#Cm#SS "++G++Gr7   )
re  rf  ra  rb  r  rc  r   rM  rL  rN  rZ   rD  )r.   r/   r0   r1   r   r   r   rv   r3   r   r   r	   r4   r   r6   r   r   s   @r8   rX  rX    s   @L} @L# @L @LJ 266:7;7;,1$)8<K||K !.K &ell3	K
 'u||4K 'u||4K $D>K D>K !u||!45K 
u  (51B1BEDUDU1U+V"WW	XK Kr7   rX  c                   B    \ rS rSr\rSrSrSS/rSr	Sr
SrSrSrS rSrg	)
IdeficsPreTrainedModeli  rd   TrG  rX  Fc                 F   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        R                  5      (       aJ  UR                  R                  R                  S5        UR                  R                  R                  5         g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[         5      (       a%  UR"                  R                  R                  5         g [        U[$        5      (       GaT  U R                   R&                  S:X  aI  UR(                  R                  R                  5         UR*                  R                  R                  5         g U R                   R&                  S:X  aK  UR(                  R                  R                  S5        UR*                  R                  R                  S5        g U R                   R&                  S;   aq  UR(                  R                  R                  SU R                   R,                  S9  UR*                  R                  R                  SU R                   R,                  S9  g g [        U[.        5      (       a%  UR0                  R                  R                  5         g g )Nr4  )r   r`  r   rZ  r   >   r]  r^  r_  )r  initializer_ranger[   r   rW   Conv2drx   datanormal_r   zero_rX   rs   rV   fill_r   r!   class_embeddingrX  rg  ra  rb  ri  r    latents)rz   r^   r`  s      r8   _init_weights$IdeficsPreTrainedModel._init_weights  s    kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)KK""$//MM$$S) 788""''//1 ?@@{{,,7'',,224""''--/..&8'',,2237""''--c2..2RR'',,44#4;;CgCg4h""''//Sdkk>b>b/c S  9::NN'') ;r7   r-   N)r.   r/   r0   r1   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_cache_class_supports_flash_attn_2_supports_static_cache_supports_attention_backendr{  r6   r-   r7   r8   rq  rq    sA     L&*#.0QRN !""&*r7   rq  c                       \ rS rSrSrg)KwargsForCausalLMi  r-   N)r.   r/   r0   r1   r6   r-   r7   r8   r  r    s    3r7   r  c            '         ^  \ rS rSrSrS\4U 4S jjrS%S jr/ 4S jr/ 4S jr	S r
S	 r\\               S&S
\\R                      S\\R"                     S\\R                      S\\\R&                        S\\R&                     S\\R&                     S\\R&                     S\\R&                     S\\R"                     S\\   S\\   S\\   S\\   S\\   S\\R                      S\\   S\\\4   4"S jj5       5       r S'S\\R"                  S4   S\R"                  S\R"                  S\S\4
S jjr\S\R"                  S\S \S!\R>                  S\R"                  S"\4S# j5       r S$r!U =r"$ )(IdeficsModeli  z
Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

Args:
    config: IdeficsConfig
r  c           
      f  > [         TU ]  U5        Xl        UR                  U l        UR
                  U l        [        UR
                  UR                  UR                  UR                  U R                  S9U l
        UR                  R                  U l        UR                  U l        [        UR                  5      U l        UR                  (       a]  UR                   n[#        UUR                  R$                  UR&                  UR(                  UR*                  UR,                  5      U l        [0        R2                  " [5        UR6                  5       Vs/ s H  n[9        XS9PM     sn5      U l        UR<                  U l        UR6                  U R<                  -  n[0        R2                  " [5        U5       Vs/ s H  n[?        XS9PM     sn5      U l         SU l!        [E        UR                  URF                  S9U l$        U RK                  5         U RM                  U5        g s  snf s  snf )N)rp   rw   rq   rm   rs   )r  Fr  )'ru   rv   r  pad_token_idrs   
vocab_sizerk   additional_vocab_sizer   freeze_text_layersembed_tokensr  
image_sizer"   vision_modeluse_resamplerperceiver_configr    r  resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layersrG  layerscross_layer_intervalrX  gated_cross_attn_layersgradient_checkpointingr   r"  norm	post_initfreeze_relevant_params)rz   r  r  inum_cross_layersr|   s        r8   rv   IdeficsModel.__init__  s    !.. ++5!,,&,&B&B ,,#66((
 !..99#114V5I5IJ %66'@$$.. 00 22 33 44(D$ mm?DVE]E]?^_?^! 5?^_
 %+$?$?!!33t7P7PP')}}KPQaKbcKba,VAKbc(
$ ',#"6#5#56;N;NO	 	##F+ ` ds   H)>H.c                     Uc  U R                   nUR                  (       a  U R                  UR                  5        UR                  (       a  [	        U R
                  UR                  S9  g g N)re   )r  r  freeze_text_module_exceptionsfreeze_vision_layersri   r  freeze_vision_module_exceptions)rz   r  s     r8   r  #IdeficsModel.freeze_relevant_params  sQ    >[[F$$##F$H$HI&&**f>d>de 'r7   c                 T    U R                   U R                  4 H  n[        X!S9  M     g r  )r  r  ri   )rz   re   r^   s      r8   r  IdeficsModel.freeze_text_layers  s!    {{DII.FE /r7   c                 ,    [        U R                  US9  g r  )ri   r  )rz   re   s     r8   r  !IdeficsModel.freeze_vision_layers  s    T&&:KLr7   c                     U R                   $ rZ   r  r   s    r8   get_input_embeddings!IdeficsModel.get_input_embeddings  s       r7   c                     Xl         g rZ   r  rz   r   s     r8   set_input_embeddings!IdeficsModel.set_input_embeddings  s    !r7   rN   rE   r   r)   inputs_embedsr@   rA   rB   rC   r/  r.  output_hidden_statesinterpolate_pos_encodingreturn_dictr0  r{   rn   c                   ^ Ub  UR                   OUR                   nUb  UOU R                  R                  nUb  UOU R                  R                  nU
b  U
OU R                  R                  n
Ub  UOU R                  R
                  nUSL USL-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U
(       a  [        R                  S5        Sn
Uc  U R                  U5      nSnU
(       aP  [        U[        5      (       d;  SnUc  [        5       nO+[        R                  " U5      n[        R                  S5        UR                   u  nnnUb  UR#                  5       OSnUU-   nUc0  [$        R&                  " UUUR                   S   -   UR                   S	9nUbG  UcD  UR)                  5       R+                  S
5      S-
  nUR-                  US:H  S5        USS2U* S24   nOUc  UR/                  S5      n[1        XgU4 Vs/ s H  nUSL PM	     sn5      S:w  a  [        S5      eUbw  UR3                  U R4                  US9nUR                   SS u  nnUR7                  5       R8                  " UU-  /UR                   SS Q76 nU R;                  XmS9R<                  nOHUbE  UR?                  5       u  nnnnUR3                  U R4                  US9nUR9                  UU-  UU5      nU R                  R@                  (       aO  Uc4  U RC                  W5      nUR?                  S5      UR?                  S5      nnOUR?                  5       u  nnnnUnO1Uc#  WR?                  S5      UR?                  S5      nnO[        S5      eUR9                  UWU-  U5      nU	R?                  S5      nU	R/                  S
5      n	U	RE                  SSSU5      n	U	R9                  UUUU-  5      n	UbB  UR?                  5       u  nnnUU4n U	c  [$        RF                  " U US	9n	U RI                  U	5      n	OSn	U	S:H  RK                  S
S9R3                  U R4                  S9RM                  SS9R3                  U5      n!Uc0  [$        RF                  " UU4[$        RN                  UR                   S9nU RQ                  X%XU5      nUn"U(       a  SOSn#U(       a  SOSn$Sn%[S        U RT                  5       H  u  n&n'U(       a  U#U"4-  n#U4S jn(U R                  (       ae  U R                  (       aT  SnU
(       a  [        R                  S5        Sn
U RW                  U(U'U"UUUUU	U!UU
U&U RX                  U RZ                  U5      n)O+U(" U'U"4UUUUU	U!UU
U&U RX                  U RZ                  US.TD6n)U)S   n"U
(       a  U)U(       a  SOS   n%U(       d  M  U$U)S   4-  n$M     U R]                  U"5      n"U(       a  U#U"4-  n#U
(       a  U%OSn*U(       a  U*R_                  5       n*UR9                  UUUU5      n[a        U"U*U#U$US9$ s  snf )a*  
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.
Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   rK   r?   r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rr   rK   )r@   r  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer4  r   r   r-   c                 r   > X-  S:X  a  XU-     nU" U4UUUUUU	S S.TD6nUS   nU " U4UUUUU	US.TD6nU$ )Nr   )rE   r,   rC   rl  r.  r/  r-  )rE   r   r-  r.  r/  r0  r-   )
main_blockr*   rE   r   r-  r,   rC   rl  r.  r/  r  r  r  r0  xblockrU  layer_outputsr{   s                    r8   vblock$IdeficsModel.forward.<locals>.vblock  s    " 3q84BV5VWF$%
'5,?-A-A*;"+'+
 !
G %,AJM *!	!#1!-#1&7'#1	! 	! %$r7   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)rE   r   r-  r,   rC   rl  r.  r/  r  r  r  r0  )r(   r)   r*   r+   r,   )1rK   r  r.  r  r/  use_return_dictrt   r  r  r  r  r  r[   r   r   from_legacy_cacherG   get_seq_lengthr3   rF   longcumsummasked_fill_r   sumrJ   rr   r  rH   r  r(   r5  r  r  rI   r   invert_attention_maskrb   squeezer   _update_causal_mask	enumerater  _gradient_checkpointing_funcr  r  r  to_legacy_cacher&   )+rz   rN   rE   r   r)   r  r@   rA   rB   rC   r/  r.  r  r  r  r0  r{   rK   return_legacy_cache
batch_size
seq_lengthr;  past_key_values_lengthseq_length_with_pastr   
num_imagesr,   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaperl  r*   all_hidden_statesall_self_attnsnext_decoder_cacheidxdecoder_layerr  r  
next_caches+                   `                          r8   r   IdeficsModel.forward  s   : &/%:!!@T@T1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M $Z??"&&".."."@"@"Q##^ %2$7$7!
JETE`!?!?!Afg),BB!"\\&(>ATATUVAW(W`m`t`tN %,*>)..077;a?L%%n&91='J;<8L!)33A6LLL`#ab#aaT	#abcghhq  %'??F?KL%1%7%7%;"J
'22499*z:QkT`TfTfghgiTjkL #'"3"3) #4 #   &1G_GdGdGfDJ
M3D":"="=DJJW]"="^"5":"::
;RTact"u;;$$#+'+'?'?@S'T$3G3L3LQ3OQeQjQjklQm00K_KdKdKfH
J7H"6!)/B/G/G/JL_LdLdefLg,M,abb166z:P]C]_pq ,0033==bA3::1aMR388\S]`mSmn*9L9Q9Q9S63Q"24I!J#+',zz2DV'T$#'#=#=>R#S #'  $83#>"C"C"C"K!O!OVZV`V`!O!a j jop j quu 

 !"ZZ12%**]MaMaN 11>L]
 & #7BD0d!"+DKK"8C#!m%55!+%Z **t}}"&''t !&I $ A A!!" #'((%--00"!$ !'!!! $2!-#2(;)=)=&7'!)-)B)B,0,H,H#1! !$ *!,M%28I1q%Q"  =#3"55G #9J 		-0  -!11+4'$
#335J166z:}^op-+&+% 3
 	
y cs   Yr#   input_tensorc           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r4  flex_attentionr   Fr3  )r  r  is_trainingr   r?   )sequence_lengthtarget_lengthrr   r0  r  )cudaxpunpu)r  r8  rb   r[   r3   r   r$   r  is_compileabler   _ignore_causal_mask_sdpar  rr   rG   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrK   typefinfomin_unmask_unattended)rz   rE   r  r0  r)   r.  past_seen_tokensusing_compilable_cacherr   r  r  causal_mask	min_dtypes                r8   r   IdeficsModel._update_causal_mask(  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr7   r  r  rr   r  c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuerr   rK   r   )diagonalr  r?   r   )r   r3   r  r  fullrK   triurF   r9  expandr   rG   rJ   rn  )rE   r  r  rr   r0  r  r{   r  r  mask_lengthpadding_masks              r8   r  BIdeficsModel._prepare_4d_causal_attention_mask_with_cache_positionl  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r7   )r  r  r  r  r  r  r  r  rs   r  r  r  r  rZ   )NNNNNNNNNNNNFNNF)#r.   r/   r0   r1   r2   r   rv   r  r  r  r  r  r   r   r   r3   rE  r   r   r4   r   r   r   r
   r	   r&   r   r   r  staticmethodr   rr   r  r6   r   r   s   @r8   r  r    s   .,} .,`f 46 F 68 M!"  151537=A5948@D<@7;$(,0/338&*59!R
E,,-R
 !.R
 u//0	R

 "$u'8'8"9:R
   1 12R
 u001R
 #+5+<+<"=R
 'u'8'89R
 'u||4R
 D>R
 $D>R
 'tnR
 #+4.R
 d^R
  !!1!12!R
" -.#R
$ 
u44	5%R
  R
v #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r7   r  c            )         ^  \ rS rSrSS/rS&U 4S jjrS rS rS rS r	S	 r
S
 rS r\\                S'S\\R"                     S\\R$                     S\\R"                     S\\\R(                        S\\R(                     S\\R(                     S\\R(                     S\\R(                     S\\R$                     S\\R"                     S\\   S\\   S\\   S\\   S\\   S\\R"                     S\\   S\\\4   4$S jj5       5       r         S(U 4S jjr S)S \S!\\\ 4   S"\S\\\ 4   4U 4S# jjjr!\"S$ 5       r#S%r$U =r%$ )*IdeficsForVisionText2Texti  zmodel.embed_tokens.weightzlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  UR                  SUR                  S9U l	        U R                  5         g )NF)r   r   r   r   rm   )ru   rv   r  rd   r   r   r  r  freeze_lm_headlm_headr  )rz   r  r  r|   s      r8   rv   "IdeficsForVisionText2Text.__init__  s[     !&)
-****$*$@$@#22
 	r7   c                 .    U R                   R                  $ rZ   rd   r  r   s    r8   r  .IdeficsForVisionText2Text.get_input_embeddings  s    zz&&&r7   c                 $    XR                   l        g rZ   r  r  s     r8   r  .IdeficsForVisionText2Text.set_input_embeddings  s    "'

r7   c                     U R                   $ rZ   r  r   s    r8   get_output_embeddings/IdeficsForVisionText2Text.get_output_embeddings  s    ||r7   c                     Xl         g rZ   r  )rz   new_embeddingss     r8   set_output_embeddings/IdeficsForVisionText2Text.set_output_embeddings  s    %r7   c                     Xl         g rZ   rd   )rz   decoders     r8   set_decoder%IdeficsForVisionText2Text.set_decoder  s    
r7   c                     U R                   $ rZ   r  r   s    r8   get_decoder%IdeficsForVisionText2Text.get_decoder  s    zzr7   c                    U R                  5       nU R                  5       n[        U R                  SS5      (       ab  UR                  Ul        UR
                  S:  aA  UR                  UR
                  :X  d   eUR                  R                  UR                  l        [        US5      (       aY  [        US5      (       aG  UR                  Ul        [        US5      (       a$  [        US5      (       a  UR
                  Ul        ggggg)	z
Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
tie_word_embeddingsTr   r   rp   r   rw   N)r  r  getattrr  rx   rw   r   ry   r   r  rp   r   )rz   output_embeddingsinput_embeddingss      r8   tie_weights%IdeficsForVisionText2Text.tie_weights  s    
 !6684464;; 5t<<'7'>'>$99A=(@@DTDnDnnnn9I9^9^9e9e!//6$n55'BRTd:e:e-=-L-L*(*CDD "=J J =M<f<f!9JD ;f5r7   rN   rE   r   r)   r  r@   rA   rB   rC   labelsr/  r.  r  r  r  r0  r{   rn   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUUU	UUUUSUS.UD6nUS   nU R                  U5      nSnU
b)  U R                  " SUXR                   R                  S.UD6n[        UUUR                  UR                  UR                  UR                  S9$ )aK  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
image_encoder_embeddings (`torch.FloatTensor`, *optional*):
    The output of the image encoder.
perceiver_embeddings (`torch.FloatTensor`, *optional*):
    The output of the perceiver resampler.
image_attention_mask (`torch.LongTensor`, *optional*):
    The attention mask for the image encoder.

Example:

```python
>>> from transformers import AutoProcessor, IdeficsForVisionText2Text

>>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
>>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

>>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
>>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

>>> prompts = [
...     [
...         "User:",
...         dogs_image_url_1,
...         "Describe this image.\nAssistant: An image of two dogs.\n",
...         "User:",
...         dogs_image_url_2,
...         "Describe this image.\nAssistant:",
...     ]
... ]
>>> inputs = processor(prompts, return_tensors="pt")
>>> generate_ids = model.generate(**inputs, max_new_tokens=6)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True)
```NT)rN   rE   r   r)   r  r@   rA   rB   rC   r/  r.  r  r  r  r0  r   )r=   r)  r  )r<   r=   r)   r*   r+   r,   r-   )r  r.  r  r  rd   r  loss_functionr  r:   r)   r*   r+   r,   )rz   rN   rE   r   r)   r  r@   rA   rB   rC   r)  r/  r.  r  r  r  r0  r{   rU  r*   r=   r<   s                         r8   r   !IdeficsForVisionText2Text.forward  s   x 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ** 
)%+'%%=!5!5/!5%=)
  !
&  
m,%%pVF{{OeOepiopD,#33!//)) ' ; ;
 	
r7   c                   > 0 nUb%  U R                   R                  (       a  XS'   O	XS'   OX|S'   UR                  SS5      US'   [        TU ]  " U4UUUUUU
U	S.UDUD6nU	b$  Uc!  US   R
                  S   nU	S S 2U* S 24   US	'   U$ )
NrB   rA   r@   r  F)r)   rE   r  r0  r   r/  rC   rN   r   rC   )r  r  popru   prepare_inputs_for_generationrG   )rz   rN   rE   r   r  r)   r0  r@   r,   rC   r/  r{   images_kwargsmodel_inputsr  r|   s                  r8   r/  7IdeficsForVisionText2Text.prepare_inputs_for_generationD  s      *{{((8K45<O89,8.)4:JJ?Y[`4a01w<
+)')%!5
 
 
  +0E%k288;J3GJ;<3XL/0r7   rU  rR   rP   c                   > [         TU ]  " UUU40 UD6nSU;   aU  US   nUS S 2SS S 24   R                  S5      nUR                  SS5      (       a  XbS'   O[        R
                  " XV/SS9US'   UR                  US'   U$ )NrC   r?   r   r/  Tr   r,   )ru   #_update_model_kwargs_for_generationr   rM   r3   r   r,   )rz   rU  rR   rP   r{   rC   	last_maskr|   s          r8   r4  =IdeficsForVisionText2Text._update_model_kwargs_for_generationq  s     wB
 	
 "\1#/0F#G ,QAX6@@CIT227@347<yyBVAbhi7j34 /6.I.I*+r7   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nr-   c              3   F   >#    U  H  oR                  S T5      v   M     g7f)r   N)rL   )r\   
past_statebeam_idxs     r8   r_   ;IdeficsForVisionText2Text._reorder_cache.<locals>.<genexpr>  s!     $g\fj%<%<Q%I%I\fs   !)r   )pastr:  reordered_past
layer_pasts    `  r8   _reorder_cache(IdeficsForVisionText2Text._reorder_cache  s1    Ju$g\f$ggiiN r7   )r  rd   rZ   )NNNNNNNNNNNNNFNN)	NNNNNNNNNr  )&r.   r/   r0   r1   _tied_weights_keysrv   r  r  r  r  r  r   r'  r   r   r   r3   rE  r   r   r4   r   r   r  r
   r	   r:   r   r/  r   r   r   r   r4  r  r?  r6   r   r   s   @r8   r  r    sh   57GH'(&g*  151537=A5948@D<@7;-1$(,0/338&*59#b
E,,-b
 !.b
 u//0	b

 "$u'8'8"9:b
   1 12b
 u001b
 #+5+<+<"=b
 'u'8'89b
 'u||4b
 ))*b
 D>b
 $D>b
 'tnb
 #+4.b
  d^!b
" !!1!12#b
$ *+%b
& 
u33	4'b
  b
N  !+b $)	 38n !	 
c3h 4  r7   r  )r  r  rq  )r   FNN)r   )r4  )Tr2   dataclassesr   typingr   r   r   r   r   r	   r
   r3   torch.nn.functionalr   r  r   torch.utils.checkpointactivationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_ideficsr   	perceiverr    visionr!   r"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerr.   r  r&   r:   rT   ri   rX   rk   rW   r   Moduler   appendr   r   r   r   r   r   r  r  rG  rX  rq  r  r  r  __all__r-   r7   r8   <module>rY     sI  (  ! D D D      ! . ) > B + X X & 1 h h 0 0 E  !!;J 
		H	% )C[ )C )CX &CK &C &CV *#Z +- k
 k
\>
RYY >
DJRYY J.   N +$
uxx $
N(:P P2 %II%<<% 
% <<	%
 U\\*% % %0a9ryy a9JN")) NbNbii Nb **_ ** **Z ?,j > `) ` `Fk 6 k\ Rr7   