
    fThXi                        S SK rS SKJr  S SKJrJrJrJrJ	r	  S SK
r
S SKJr  S SKr
SSKJr  SSKJr  SSKJrJr  SSKJrJr  SS	KJr  SS
KJrJrJrJr  SSKJ r   SSK!J"r"  SSK#J$r$  SSK%J&r&J'r'J(r(J)r)  SSK*J+r+J,r,  \RZ                  " \.5      r/ S:S\R`                  S\
Rb                  S\
Rb                  S\
Rb                  S\\
Rb                     S\2S\24S jjr3 " S S\$5      r4 " S S\"5      r5\ " S S\5      5       r6\ " S  S!\5      5       r7 " S" S#\R`                  5      r8 " S$ S%\R`                  5      r9 " S& S'\ 5      r:\Rv                  \4S(.r< " S) S*\R`                  5      r= " S+ S,\R`                  5      r>\ " S- S.\65      5       r? " S/ S0\)5      r@SrA " S1 S2\R`                  5      rB " S3 S4\(5      rC " S5 S6\&5      rD " S7 S8\'5      rE/ S9QrFg);    N)	dataclass)CallableListOptionalTupleUnion   )ACT2FN)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging	torch_int   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfigmodulequerykeyvalueattention_maskscalingdropoutc                    UnUn	[         R                  " XR                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R
                  R                  U
SS9n
[        R
                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r	   dim)ptrainingr   )
torchmatmul	transposeshapenn
functionalsoftmaxr&   r-   
contiguous)r    r!   r"   r#   r$   r%   r&   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                e/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr=   .   s     JL<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1 ==((2(>L==((6??([L,,|:K''1-88:K$$    c                       \ rS rSrSrg)InternVLVisionRMSNormI    N__name__
__module____qualname____firstlineno____static_attributes__rB   r>   r<   r@   r@   I       r>   r@   c            
          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\\R                     S\\R                     S\	\
   4S jjrS	rU =r$ )InternVLVisionAttentionM   configc                 0  > [         TU ]  5         U ?SU l        UR                  nU(       a  [        U R                  5      O[        R                  " 5       U l	        U(       a  [        U R                  5      U l
        g [        R                  " 5       U l
        g NF)super__init__num_key_value_groups	is_causaluse_qk_normr@   	embed_dimr2   Identityq_normk_norm)selfrM   qk_norm	__class__s      r<   rQ    InternVLVisionAttention.__init__N   sb    % $$?F+DNN;BKKM?F+DNN;BKKMr>   hidden_statesr$   output_attentionsr6   c                 8   UR                  5       u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R                  U	5      n	UR                  XVU R                  U R                  5      R                  SS5      nU	R                  XVU R                  U R                  5      R                  SS5      n	U
R                  XVU R                  U R                  5      R                  SS5      n
[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R!                  S5        O["        U R                  R                     nU" U UU	U
U4U R$                  (       d  SOU R&                  U R(                  SS	.UD6u  pUR                  XVU R*                  5      nU R-                  U5      nU R/                  U5      nU(       a  X4nU$ US 4nU$ )
Nr   r   eagersdpar^   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r&   r%   rS   )sizeq_projk_projv_projrW   rX   reshape	num_headshead_dimr0   viewr=   rM   _attn_implementationgetloggerwarning_oncer   r-   attention_dropoutscalerU   projection_layerprojection_dropout)rY   r]   r$   r^   r6   
batch_sizeseq_len_query_statesr7   r8   attention_interfacer;   r9   outputoutputss                   r<   forwardInternVLVisionAttention.forwardY   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0,=6( EKD>r>   )rS   rX   rW   )NN)rD   rE   rF   rG   r   rQ   r.   Tensorr   r   r   rz   rH   __classcell__r[   s   @r<   rK   rK   M   sf    	Z3 	Z 2648	/||/ !./ $ELL1	/
 -./ /r>   rK   c                   8    \ rS rSr\rSrSrSrS/r	Sr
SrS rSrg)	InternVLVisionPreTrainedModel   internvl_visionpixel_valuesTInternVLVisionLayerc                    [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR
                  R                  R                  SU R                  R                  S9  UR                  b2  UR
                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        g[        U[         5      (       a  UR"                  R                  R                  5         UR$                  b$  UR$                  R                  R                  5         UR&                  b%  UR&                  R                  R                  5         gg[        U[(        5      (       as  UR*                  R                  R                  U R                  R,                  5        UR.                  R                  R                  U R                  R,                  5        gg)zInitialize the weightsrb   meanstdN      ?)
isinstancer2   LinearConv2dConvTranspose2dweightdatanormal_rM   initializer_rangebiaszero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr   lambda_1layer_scale_init_valuelambda_2)rY   r    s     r<   _init_weights+InternVLVisionPreTrainedModel._init_weights   s   fryy"))R5G5GHII MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) 899!!'')  ,!!&&,,.))5**//557 6 344OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r>   rB   N)rD   rE   rF   rG   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2r   rH   rB   r>   r<   r   r      s2    'L)$O&*#./N!Kr>   r   c                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   av  
Class for outputs of [`InternVLVisionModel`].

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
rB   N)rD   rE   rF   rG   __doc__rH   rB   r>   r<   r   r      s    r>   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )InternVLVisionPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pTUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l        Xpl        [        R                  " XEX3S9U l
        g )Nr   r   )kernel_sizestride)rP   rQ   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper2   r   
projection)	rY   rM   r   r   r   r   r   r   r[   s	           r<   rQ   &InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir>   r   returnc                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR                   S   UR                   S   pUR	                  S5      R                  SS5      nXgU44$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r	   r   )r1   r   
ValueErrorr   flattenr0   )	rY   r   rs   r   heightwidth
embeddingspatch_heightpatch_widths	            r<   rz   %InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
&,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
+666r>   )r   r   r   r   r   r   )rD   rE   rF   rG   r   rQ   r.   r|   rz   rH   r}   r~   s   @r<   r   r      s.    j7ELL 7U\\ 7 7r>   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\\R                     S\R                  4S jjrSrU =r$ )r      z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

rM   r   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )rP   rQ   r2   	Parameterr.   zerosr   r   use_mask_tokenr   r   patch_embeddingsr   r   r   collectionsabcIterabler    use_absolute_position_embeddingsr   Dropouthidden_dropout_probr&   )rY   rM   r   r[   s      r<   rQ   !InternVLVisionEmbeddings.__init__   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r>   r   r   r   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr)   r         ?r	   r   bicubicF)rc   modealign_cornersr*   )r1   r   r.   jit
is_tracingr   r   rg   permuter2   r3   interpolaterj   cat)rY   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr+   
new_height	new_widthsqrt_num_positionss               r<   interpolate_pos_encoding1InternVLVisionEmbeddings.interpolate_pos_encoding  s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr>   r   bool_masked_posc                    UR                   u    p4nU R                  U5      u  nu  pxUR                  5       u  pnUbI  U R                  R	                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  R	                  U	SS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nXgU44$ )Nr)   r   r*   )r1   r   rc   r   expand	unsqueezetype_asr   r.   r   r   r   r&   )rY   r   r   ru   r   r   r   r   r   rs   rt   mask_tokensw
cls_tokenss                 r<   rz    InternVLVisionEmbeddings.forward3  s    
 +001e262G2G2U/
/\!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
+666r>   )r   r&   r   r   r   r   r   N)rD   rE   rF   rG   r   r   rQ   r.   r|   intr   r   
BoolTensorrz   rH   r}   r~   s   @r<   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7 7r>   r   c                       \ rS rSrSrg)InternVLVisionMLPiM  rB   NrC   rB   r>   r<   r   r   M  rI   r>   r   )
layer_normrms_normc                      ^  \ rS rSrSrS\SS4U 4S jjr SS\R                  S\	S\
\\R                     \\R                  \R                  4   4   4S	 jjrS
rU =r$ )r   iT  z?This corresponds to the Block class in the timm implementation.rM   r   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                     " UR                  UR                  S9U l        [        UR                     " UR                  UR                  S9U l        UR                  n[        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R*                  " UR,                  5      U l        g )Nr   epsT)requires_grad)rP   rQ   chunk_size_feed_forwardseq_len_dimrK   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterr   r2   r   r.   onesr   r   r   r   r&   )rY   rM   init_valuesr[   s      r<   rQ   InternVLVisionLayer.__init__W  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::v?Q?Q3S%Scgh[5::v?Q?Q3S%Scghzz&"<"<=r>   r]   r^   c                    U R                  U R                  U5      US9u  p4U R                  U-  nX1-   nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  b  U R                  U-  nXQ-   nXT4$ )N)r^   )r   r   r   r  r   r&   r   )rY   r]   r^   attention_outputattention_weightslayer_outputs         r<   rz   InternVLVisionLayer.forwardf  s    
 /3nn!!-0/ /= /
+
  ==+;; )8 ++M:xx-||L1==$==<7L $3..r>   )	r   r   r&   r   r   r  r   r   r   )F)rD   rE   rF   rG   r   r   rQ   r.   r|   boolr   r   rz   rH   r}   r~   s   @r<   r   r   T  sn    I>3 > >$ #(/||/  / 
uU\\"E%,,*D$EE	F	/ /r>   r   c                   ~   ^  \ rS rSrS\SS4U 4S jjr\  SS\R                  S\	S\	S\
\\4   4S	 jj5       rS
rU =r$ )InternVLVisionEncoderi  rM   r   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf rO   )
rP   rQ   rM   r2   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rY   rM   ir[   s      r<   rQ   InternVLVisionEncoder.__init__  sS    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A&r]   r^   output_hidden_statesc                 j   U(       a  SOS nU(       a  SOS n[        U R                  5       Hn  u  pgU(       a  XA4-   nU R                  (       a.  U R                  (       a  U R	                  UR
                  X5      nOU" X5      nUS   nU(       d  Mf  XXS   4-   nMp     U(       a  XA4-   n[        UUUS9$ )NrB   r   r   last_hidden_stater]   
attentions)	enumerater  r  r-   _gradient_checkpointing_func__call__r   )	rY   r]   r^   r  all_hidden_statesall_self_attentionsr  layer_modulelayer_outputss	            r<   rz   InternVLVisionEncoder.forward  s     #7BD$5b4(4OA#$58H$H!**t}} $ A A ))=! !-] N)!,M  &91=M<O&O#  5    14D D++*
 	
r>   )rM   r  r  )FF)rD   rE   rF   rG   r   rQ   r   r.   r|   r
  r   tupler   rz   rH   r}   r~   s   @r<   r  r    sg    ,3 , ,  #(%*	 
|| 
   
 #	 

 
uo%	& 
  
r>   r  c                      ^  \ rS rSrS\SS4U 4S jjrS r\\   SS\	R                  S\\	R                     S	\\   S
\\   S\\\4   4
S jj5       5       rSrU =r$ )InternVLVisionModeli  rM   r   Nc                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U R                  5         g )Nr   )rP   rQ   rM   r   r   r  encoderuse_mean_poolingr2   rV   r   r   r   	layernorm	post_initrY   rM   r[   s     r<   rQ   InternVLVisionModel.__init__  sm     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r>   c                 .    U R                   R                  $ r   )r   r   )rY   s    r<   get_input_embeddings(InternVLVisionModel.get_input_embeddings  s    ///r>   r   r   r^   r  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  XS9u  pVU R	                  UUUS9nUS   nU R                  U5      n[        UUR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)r   )r^   r  r   r  )	rM   r^   r  r   r&  r(  r   r]   r  )	rY   r   r   r^   r  embedding_outputru   encoder_outputssequence_outputs	            r<   rz   InternVLVisionModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 #oolo\,,/!5 ' 

 *!,..93-)77&11
 	
r>   )rM   r   r&  r(  )NNN)rD   rE   rF   rG   r   rQ   r-  r   r   r.   r|   r   r   r
  r   r"  r   rz   rH   r}   r~   s   @r<   r$  r$    s    3  0  7;,0/3
ll
 "%"2"23
 $D>	

 'tn
 
u::	;
  
r>   r$  c                       \ rS rSrS rSrg)InternVLPreTrainedModeli  c                 4   [        U R                  SU R                  R                  5       R                  5      n[	        U[
        R                  5      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [	        U[
        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g g )Nr   rb   r   r   )getattrrM   get_text_configr   r   r2   r   r   r   r   r   r   r   r   )rY   r    r   s      r<   r   %InternVLPreTrainedModel._init_weights  s    dkk#68S8S8U8g8ghfbii((MM&&CS&9{{&  &&( '--KK""$MM$$S) .r>   rB   N)rD   rE   rF   rG   r   rH   rB   r>   r<   r5  r5    s    	*r>   r5  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )InternVLMultiModalProjectori  rM   c                 0  > [         TU ]  5         [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  5      U l        [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  UR                  R
                  5      U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  5      U l        g )Nr   r   )rP   rQ   r2   r   vision_configr   r   downsample_ratior   r   text_configlinear_1r
   projector_hidden_actactlinear_2r*  s     r<   rQ   $InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar>   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r@  rB  rC  )rY   image_featuresr]   s      r<   rz   #InternVLMultiModalProjector.forward  s@    7m4/m4r>   )rB  r   r@  rC  )	rD   rE   rF   rG   r   rQ   rz   rH   r}   r~   s   @r<   r;  r;    s    b~ b r>   r;  c                   x    \ rS rSrSS\R
                  S\4S jjrS\R                  S\	\
\\
   4   S\4S jrS	rg
)InternVLModeli  vision_featuresscale_factorc           
         UR                  5       u  p4pVXR-  S:w  d  XB-  S:w  a  [        S5      eUR                  X4[        XR-  5      [        Xb-  5      5      nUR	                  SSSS5      R                  5       nUR                  U[        XR-  5      [        XB-  5      [        XbS-  -  5      5      nUR	                  SSSS5      R                  5       nU$ )a  Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r	   )rc   r   rj   r   r   r5   )rY   rJ  rK  rs   r   r   channelss          r<   pixel_shuffleInternVLModel.pixel_shuffle  s     />.B.B.D+
6 A%)=)Bjkk *..s6#893x?V;W
 *11!Q1=HHJ *..F12C8L4MsS[mn_nSoOp

 *11!Q1=HHJr>   r   vision_feature_layervision_feature_select_strategyc                    U R                   R                  nUS:X  a  U R                  US9R                  nOU R	                  US9R
                  U   nUS:X  a  USS2SS2SS24   nUR                  S   n[        US-  5      nUR                  S   n	UR                  XUS5      nU R                  XeS9nUR                  U	SUR                  S   5      nU R                  U5      nU$ )	a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layer (`int` or `List[int]`):
        Layer index or list of layer indices to extract features from.
Returns:
    vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
r)   )r   defaultNr   r   r   )rK  )rM   r>  vision_towerr  vision_modelr]   r1   r   rg   rN  multi_modal_projector)
rY   r   rP  rQ  r6   r>  rJ  rM  feature_sizers   s
             r<   get_image_features InternVLModel.get_image_features+  s   $  ;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*LZ\] ,,_,\ *11*b/BWBWXZB[\ 44_Er>   rB   N)r   )rD   rE   rF   rG   r.   r|   floatrN  FloatTensorr   r   r   strrX  rH   rB   r>   r<   rI  rI    sN    !U\\ ! !F+''+ $CcN3+ ),	+r>   rI  c                       \ rS rSrSrg)InternVLCausalLMOutputWithPastiY  rB   NrC   rB   r>   r<   r^  r^  Y  rI   r>   r^  c                   (   ^  \ rS rSrU 4S jrSrU =r$ ) InternVLForConditionalGenerationi]  c                  :   > [        5       R                  " S0 U D6  g)ay  
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```NrB   )rP   rz   )super_kwargsr[   s    r<   rz   (InternVLForConditionalGeneration.forward^  s    H 	','r>   rB   )rD   rE   rF   rG   rz   rH   r}   r~   s   @r<   r`  r`  ]  s    $( $(r>   r`  )r   r$  r5  rI  r`  )rb   )Gcollections.abcr   dataclassesr   typingr   r   r   r   r   r.   torch.nnr2   torch.utils.checkpointactivationsr
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   configuration_internvlr   r   
get_loggerrD   rm   Moduler|   rZ  r=   r@   rK   r   r   r   r   r   r   r   r   r  r$  r5  INTERNVL_INPUTS_DOCSTRINGr;  rI  r^  r`  __all__rB   r>   r<   <module>rx     s  "  ! 9 9    ! B K F & I I ( 7 /  I 
		H	% %II%<<% 
% <<	%
 U\\*% % %6	L 	;2 ;|  KO  K  KF +E  2!7BII !7L[7ryy [7|	 	 3H
I-/")) -/`(
BII (
V 2
7 2
 2
j
*2 
* ! ")) $OJ Od	%@ 	%('D %(Pr>   