
import collections.abc
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations import use_kernel_forward_from_hub
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import (
    LossKwargs,
    ModelOutput,
    auto_docstring,
    can_return_tuple,
    is_torchdynamo_compiling,
    logging,
    torch_int,
)
from ..auto import AutoModel
from .configuration_internvl import InternVLConfig, InternVLVisionConfig


logger = logging.get_logger(__name__)


@use_kernel_forward_from_hub("RMSNorm")
class InternVLVisionRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = key
    value_states = value

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
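# A minimal, hedged sketch of the tensor contract assumed by `eager_attention_forward`
# above (sizes are illustrative, not taken from a released checkpoint): inputs are
# laid out as (batch, num_heads, seq_len, head_dim) and the function returns
# (batch, seq_len, num_heads, head_dim) together with the attention weights.
#
#     mod = nn.Module()                       # only `.training` is read by the function
#     q = k = v = torch.randn(2, 8, 16, 64)   # (batch, heads, seq, head_dim), assumed sizes
#     out, weights = eager_attention_forward(mod, q, k, v, None, scaling=64**-0.5)
#     # out.shape == (2, 16, 8, 64); weights.shape == (2, 8, 16, 16)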
          ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                     S\	\R                     S\
\   4S	 jjrS
rU =r$ )InternVLVisionAttentione   z+Attention Class for InternVL Vision Encoderconfigc                 $  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  5      U l        US:  a  [        R*                  " U5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      U l        g [        R,                  " 5       U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r%   r&   rk   r.   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr'   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr"   q_normk_norm)r-   rk   proj_dropoutqk_normr0   s       r1   r&    InternVLVisionAttention.__init__h   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr3   r=   rS   output_attentionsra   c                 8   UR                  5       u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R                  U	5      n	UR                  XVU R                  U R                  5      R                  SS5      nU	R                  XVU R                  U R                  5      R                  SS5      n	U
R                  XVU R                  U R                  5      R                  SS5      n
[        nU R                  R                  S:w  ad  U R                  R                  S:X  a-  UR                  SS5      (       a  [        R!                  S5        O["        U R                  R                     nU" U UU	U
U4U R$                  (       d  SOU R&                  U R(                  SS	.UD6u  pUR                  XVU R*                  5      nU R-                  U5      nU R/                  U5      nU(       a  X4nU$ US 4nU$ )
Nr   r   eagersdpar   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rU   rT   rx   )sizer{   r|   r}   r   r   reshaperq   rr   r]   viewrg   rk   _attn_implementationgetloggerwarning_oncer   r[   ru   rt   ro   r~   rv   )r-   r=   rS   r   ra   
batch_sizeseq_len_query_statesrb   rc   attention_interfacerf   rd   outputoutputss                   r1   r@   InternVLVisionAttention.forward   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0,=6( EKD>r3   )ru   rk   ro   rr   rx   r   r|   rq   rv   r~   r   r{   rt   r}   )NN)rH   rI   rJ   rK   __doc__r   r&   r)   Tensorr   r   r   r@   rL   rM   rN   s   @r1   ri   ri   e   si    5Z3 Z> 2648	/||/ !./ $ELL1	/
 -./ /r3   ri   c                   8    \ rS rSr\rSrSrSrS/r	Sr
SrS rSrg)	InternVLVisionPreTrainedModel   internvl_visionpixel_valuesTInternVLVisionLayerc                    [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR
                  R                  R                  SU R                  R                  S9  UR                  b2  UR
                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        g[        U[         5      (       a  UR"                  R                  R                  5         UR$                  b$  UR$                  R                  R                  5         UR&                  b%  UR&                  R                  R                  5         gg[        U[(        5      (       as  UR*                  R                  R                  U R                  R,                  5        UR.                  R                  R                  U R                  R,                  5        gg)zInitialize the weightsr   r;   stdN      ?)
isinstancer'   ry   Conv2dConvTranspose2dr+   datanormal_rk   initializer_rangern   zero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr   lambda_1layer_scale_init_valuelambda_2)r-   rO   s     r1   _init_weights+InternVLVisionPreTrainedModel._init_weights   s   fryy"))R5G5GHII MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S) 899!!'')  ,!!&&,,.))5**//557 6 344OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r3    N)rH   rI   rJ   rK   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2r   rL   r   r3   r1   r   r      s2    'L)$O&*#./N!Kr3   r   c                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   av  
@dataclass
class InternVLVisionModelOutputWithPooling(BaseModelOutputWithPooling):
    r"""
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """
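# Hedged usage sketch for the output dataclass above (shapes are assumptions, not
# checkpoint values): downstream code reads named fields rather than indexing tuples.
#
#     outputs = vision_model(pixel_values)        # hypothetical InternVLVisionModel call
#     tokens = outputs.last_hidden_state           # (batch, 1 + num_patches, hidden_size)
#     attn = outputs.attentions                    # None unless output_attentions=True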
class InternVLVisionPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)
class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).reshape(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        attention_output, attention_weights = self.attention(
            self.layernorm_before(hidden_states),  # layernorm is applied before self-attention
            output_attentions=output_attentions,
        )
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output, attention_weights


class InternVLVisionEncoder(nn.Module):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, output_attentions
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config

        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class InternVLPreTrainedModel(PreTrainedModel):
    config_class = InternVLConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"

    _supports_cache_class = True
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
@dataclass
class InternVLModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    Base class for InternVL outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@auto_docstring(
    custom_intro="""
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class InternVLModel(InternVLPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: InternVLConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = InternVLMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        r"""
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `List[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:, :]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to final dimensions
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        # Project features through multi-modal projector
        vision_features = self.multi_modal_projector(vision_features)

        return vision_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        image_sizes: torch.Tensor = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[Tuple, InternVLModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values,
                vision_feature_layer=vision_feature_layer,
                vision_feature_select_strategy=vision_feature_select_strategy,
                image_sizes=image_sizes,
            )

            if input_ids is None:
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
                n_image_tokens = special_image_mask.all(-1).sum(dim=-1)[0]
            else:
                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
                special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
                n_image_tokens = (input_ids == self.config.image_token_id).sum()

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                n_image_tokens = (input_ids == self.config.image_token_id).sum()
                n_image_features = image_features.shape[0] * image_features.shape[1]
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )

            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return InternVLModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )

    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute dimensions to align downsampled axis correctly
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        # Reshape to achieve final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )

        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features


@dataclass
class InternVLCausalLMOutputWithPast(ModelOutput):
    r"""
    Base class for InternVL causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[List[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


@auto_docstring(
    custom_intro="""
    The INTERNVL model which consists of a vision backbone and a language model.
    """
)
class InternVLForConditionalGeneration(InternVLPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: InternVLConfig):
        super().__init__(config)
        self.model = InternVLModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Make modules available through the conditional generation class for backwards compatibility
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layer: Optional[Union[int, List[int]]] = None,
        vision_feature_select_strategy: Optional[str] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs: Unpack[KwargsForCausalLM],
    ) -> Union[Tuple, InternVLCausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layer = (
            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
        )
        vision_feature_select_strategy = (
            vision_feature_select_strategy
            if vision_feature_select_strategy is not None
            else self.config.vision_feature_select_strategy
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            vision_feature_layer=vision_feature_layer,
            vision_feature_select_strategy=vision_feature_select_strategy,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            image_sizes=image_sizes,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return InternVLCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model

        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # If we're in cached decoding stage, pixel values should be None because input ids do not contain
            # special image tokens anymore. Otherwise we need pixel values to be passed to the model.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        r"""
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__all__ = [
    "InternVLVisionPreTrainedModel",
    "InternVLVisionModel",
    "InternVLPreTrainedModel",
    "InternVLModel",
    "InternVLForConditionalGeneration",
]