
"""PyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object"""

import math
from dataclasses import dataclass
from typing import Callable, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...utils import ModelOutput, logging
from .configuration_idefics import IdeficsVisionConfig


logger = logging.get_logger(__name__)


@dataclass
class IdeficsVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class IdeficsVisionEmbeddings(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.embed_dim
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        num_patches = embeddings.shape[1] - 1
        pos_embed = self.position_embedding(self.position_ids)
        num_positions = pos_embed.shape[1] - 1
        if num_patches == num_positions and height == width:
            return pos_embed
        class_pos_embed = pos_embed[:, 0]
        patch_pos_embed = pos_embed[:, 1:]

        embed_dim = embeddings.shape[-1]
        num_h_patches = height // self.config.patch_size
        num_w_patches = width // self.config.patch_size
        # add a small number to avoid a floating point error in the interpolation
        num_h_patches, num_w_patches = num_h_patches + 0.1, num_w_patches + 0.1
        sqrt_num_positions = math.sqrt(num_positions)
        patch_pos_embed = patch_pos_embed.reshape(1, int(sqrt_num_positions), int(sqrt_num_positions), embed_dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
        fp32_upcasting = patch_pos_embed.dtype == torch.bfloat16
        if fp32_upcasting:
            logger.warning_once(
                "Upcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead."
            )
            patch_pos_embed = patch_pos_embed.to(torch.float)
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(num_h_patches / sqrt_num_positions, num_w_patches / sqrt_num_positions),
            mode="bicubic",
            align_corners=False,
        )
        if fp32_upcasting:
            patch_pos_embed = patch_pos_embed.to(torch.bfloat16)
        if int(num_h_patches) != patch_pos_embed.shape[-2] or int(num_w_patches) != patch_pos_embed.shape[-1]:
            raise ValueError(
                f"Number of patches for images ({int(num_h_patches), int(num_w_patches)}) don't match the "
                f"shape of position embedding ({patch_pos_embed.shape[-2], patch_pos_embed.shape[-1]})"
            )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, embed_dim)
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if not interpolate_pos_encoding:
            if height != self.image_size or width != self.image_size:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size}*{self.image_size}). You should try to set `interpolate_pos_encoding=True`"
                )

        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)

        return embeddings

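
# Shape sketch for the embeddings above (illustrative numbers only; the grid size depends on the
# checkpoint's IdeficsVisionConfig, and image_size=224 / patch_size=14 below are an assumption):
#
#     pixel_values:                    (batch, 3, 224, 224)
#     patch_embedding(pixel_values):   (batch, embed_dim, 16, 16)   # 224 // 14 = 16 patches per side
#     flatten(2).transpose(1, 2):      (batch, 256, embed_dim)      # 16 * 16 = 256 patch tokens
#     with class embedding prepended:  (batch, 257, embed_dim)      # num_positions = 256 + 1
#
# With `interpolate_pos_encoding=True`, a 448x448 input gives a 32x32 patch grid and the stored
# 16x16 position grid is resized bicubically by `interpolate_pos_encoding` to match it.
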
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class IdeficsVisionAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.embed_dim
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # both `causal_attention_mask` and `attention_mask` may be provided; when the flash-attention
        # kernel is used, `is_causal` is inferred from `causal_attention_mask` instead of merging the masks
        if self.config._attn_implementation != "flash_attention_2":
            if attention_mask is not None and causal_attention_mask is not None:
                attention_mask = attention_mask + causal_attention_mask
            elif causal_attention_mask is not None:
                attention_mask = causal_attention_mask
        else:
            self.is_causal = causal_attention_mask is not None

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights


class IdeficsVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.embed_dim, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.embed_dim)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states

class IdeficsVisionEncoderLayer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.embed_dim = config.embed_dim
        self.self_attn = IdeficsVisionAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = IdeficsVisionMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs

class IdeficsVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    """

    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([IdeficsVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class IdeficsVisionTransformer(nn.Module):
    def __init__(self, config: IdeficsVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.embed_dim

        self.embeddings = IdeficsVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = IdeficsVisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:

        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
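

# Minimal usage sketch (illustrative only, kept as a comment so nothing runs on import; the default
# IdeficsVisionConfig values are merely an assumption about a sensible standalone test setup):
#
#     config = IdeficsVisionConfig()
#     vision_model = IdeficsVisionTransformer(config)
#     pixel_values = torch.randn(1, config.num_channels, config.image_size, config.image_size)
#     outputs = vision_model(pixel_values, return_dict=True)
#     # outputs.last_hidden_state: (1, num_patches + 1, embedding dim)
#     # outputs.pooler_output:     (1, embedding dim)  -- post-layernormed CLS token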