"""PyTorch ViLT model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import CrossEntropyLoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_vilt import ViltConfig


logger = logging.get_logger(__name__)


@dataclass
class ViltForImagesAndTextClassificationOutput(ModelOutput):
    """
    Class for outputs of [`ViltForImagesAndTextClassification`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
            the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`List[tuple(torch.FloatTensor)]`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the attention
            weights of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the
            attention softmax, used to compute the weighted average in the self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[List[Tuple[torch.FloatTensor]]] = None
    attentions: Optional[List[Tuple[torch.FloatTensor]]] = None


class ViltEmbeddings(nn.Module):
    """
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
    """
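
    # The joint sequence produced by `forward` is text first, then image:
    #   [text tokens (BERT-style, incl. special tokens)] | [image CLS slot, sampled patches]
    # Modality ("token type") embeddings are added on top: index 0 for text tokens,
    # `image_token_type_idx` (1 by default, 2 for the second image in NLVR2) for patches.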
c                 6  > [         TU ]  5         [        U5      U l        [        R
                  " [        R                  " SSUR                  5      5      U l	        [        U5      U l        U R                  R                  n[        R
                  " [        R                  " SUS-   UR                  5      5      U l        [        R                  " UR                  UR                  5      U l        [        R"                  " UR$                  5      U l        Xl        g Nr   )super__init__TextEmbeddingstext_embeddingsr   	Parameterr'   zeroshidden_size	cls_tokenViltPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddings	Embeddingmodality_type_vocab_sizetoken_type_embeddingsDropouthidden_dropout_probdropoutconfig)selfrD   r<   	__class__s      r,   r3   ViltEmbeddings.__init__P   s      .f5ekk!Q8J8J&KL 3F ;++77#%<<A{QPVPbPb0c#d %'\\&2Q2QSYSeSe%f"zz&"<"<=r+   c                 .   U R                   R                  R                  R                  u    pEnU R                  U5      nUS S 2S S S 2S S 24   R	                  5       n[
        R                  R                  XR                  S   UR                  S   4S9R                  5       nUS S 2S4   R                  SS9S S 2S4   n	US S 2S4   R                  SS9S S 2S4   n
UR                  u  ppU R                  R                  U R                  R                  -  nU R                  S S 2SS 2S S 24   R                  SS5      R                  SXU5      n[         R"                  " [%        X5       VVs/ s HP  u  nn[
        R                  R'                  [
        R                  R                  UUU4SSS	9SUU-
  SUU-
  45      PMR     snnSS9nUR)                  S5      R                  SS5      nUR)                  S5      R                  SS5      n[         R*                  " [-        [         R.                  " UR                  S
   5      [         R.                  " UR                  S   5      SS9SS9R1                  UR2                  S9nUS S S S 2S S 2S S 24   nUR5                  UR                  S   UR                  S   SSS5      nUR)                  SS5      nUR)                  S5      nUS:  d  Ub  [7        U[8        5      (       d  X-  nUR;                  5       nOX-  n[=        UR;                  5       U5      nUR?                  SS9nSU-
  R?                  SS9nUS S 2S4   RA                  5       nU Vs/ s H  nUUS S 2S4   U:H     PM     nnU Vs/ s H  nUUS S 2S4   U:H     PM     nnU Vs/ s H  nURC                  S5      PM     nnU Vs/ s H  nURC                  S5      PM     nnU Vs/ s H  nUU-
  PM
     nn/ n [E        [%        UUU5      5       H  u  n!u  nn"n#U#S::  aR  [         RF                  " [         RH                  " U5      R	                  5       U5      n$U RK                  UU!   U$   5        Mb  [         RF                  " [         RH                  " U"5      R	                  5       U#SS9n%U RK                  [         R"                  " UU!   UU!   U%   /SS95        M     [         R"                  " U SS9n UU S S 2S4   U S S 2S4   4   R                  USU5      nUU S S 2S4   U S S 2S4   4   R                  US5      nUU S S 2S4   U S S 2S4   4   R                  USS5      nUU S S 2S4   U S S 2S4   4   R                  USU5      nU RL                  R5                  USS5      n&[         R"                  " U&U4SS9n[         R"                  " U R                  S S 2SS S 24   S S 2S S S 24   R5                  USS5      U4SS9nUU-   nU RO                  U5      n[         R"                  " [         RH                  " UR                  S   S5      R1                  U5      U/SS9nXxUX444$ s  snnf s  snf s  snf s  snf s  snf s  snf )N   r
   )sizer   r   dimbilinearT)rJ   modealign_cornersij)indexingdeviceF)as_tuple)replacement)(r;   
projectionweightshapefloatr   
functionalinterpolatelongsumrD   
image_size
patch_sizer=   	transposeviewr'   catzippadflattenstackr   arangetorU   expand
isinstanceintmaxminnonzerouniquerJ   	enumeratemultinomialonesappendr9   rC   )'rE   pixel_values
pixel_maskmax_image_length_phpwxx_maskx_hx_w
batch_sizenum_channelsheightwidth	patch_dimspatial_poshw	pos_embedpatch_indexeffective_resolution	valid_idxnon_valid_idxunique_rowsuvalid_row_idxnon_valid_row_idxv
valid_numsnon_valid_numspad_numsselectinvpvalid_choice
pad_choice
cls_tokenss'                                          r,   visual_embedViltEmbeddings.visual_embed_   sH   ,,77>>DD1"!!,/AtQM*002**6QWWQZ8P*QVVXQTl1%ad+QTl1%ad+23''/
&KK**dkk.D.DD	..q!"ax8BB1aHMMaQ]jstII  M *DAq !!MM--#V'&*	 .  	1fqj1 * 
	  %%a(221a8	IIaL""1a(kkU\\&,,r"23U\\&,,rBR5S^bcik

"FMM"
" 	 "$aA"56!((a&,,q/2rSUV!))!Q/"a#3#;:N^`cCdCd
 $'9 3779#&9 "#7#;#;#=?OPNNEN2	V,,e,<1o,,.BMN+Q9QT?a#78+NNYZk]=A+>!+CDkZ)67AaffQi
7->?->!&&)->?2<=*Q$q(*=&s:~x'PQMAz2qAv$00A1D1D1FHXYmA.|<="..uzz"~/C/C/EqVZ[
eiiq)9;LQ;OPZ;[(\bcde R 6q)fQTlF1a4L()..z2|Lq!tfQTl2388RH!&A,q!t"<=BB:rSTUfQTlF1a4L89>>z2|\	^^**:r2>
IIz1o1-II%%aAg.q$z:AA*bRTUW`agh
	 	MLLOEJJv||A:==fEvNTUV;888SP OZ7?=s%   1AY8
7Y>Z5ZZ7Zc	           
         U R                  XUS9n	Uc(  U R                  XEU R                  R                  S9u  pznOUR	                  S5      n
Uc  SnXR                  [        R                  " U[        R                  U	R                  S95      -   n	XpR                  [        R                  " X[        R                  U	R                  S95      -   n[        R                  " X/SS9n[        R                  " X*/SS9nX4$ )N)	input_idstoken_type_idsinputs_embeds)rx   r   dtyperU   rK   )r5   r   rD   rx   rg   r@   r'   
zeros_liker^   rU   	full_likerd   )rE   r   attention_maskr   rv   rw   r   image_embedsimage_token_type_idxtext_embedsimage_masksr   
embeddingsmaskss                 r,   forwardViltEmbeddings.forward   s    **m + 

 595F5F4;;;W;W 6G 62L{ %,,Q/K  '#$ !$>$>^5::kFXFXY%
 
 $&@&@OOKUZZXcXjXjk'
 

 YY:B
		>7Q?  r+   )r9   rD   rC   r;   r=   r5   r@   )   )r   )
r"   r#   r$   r%   r&   r3   r   r   r*   __classcell__rF   s   @r,   r.   r.   G   s#    V9B '! '!r+   r.   c                   6   ^  \ rS rSrSrU 4S jrSS jrSrU =r$ )r4      zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxepsposition_embedding_typeabsoluteposition_ids)r   rQ   F)
persistentr   r   )r2   r3   r   r>   
vocab_sizer8   pad_token_idword_embeddingsmax_position_embeddingsr=   type_vocab_sizer@   	LayerNormlayer_norm_epsrA   rB   rC   getattrr   register_bufferr'   ri   rk   r7   r   rJ   r^   rE   rD   rF   s     r,   r3   TextEmbeddings.__init__   s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r+   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )NrQ   r   r   r   r   r   )rJ   r   hasattrr   rk   r'   r7   r^   rU   r   r@   r   r=   r   rC   )rE   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr@   r   r=   s               r,   r   TextEmbeddings.forward   s:    #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r+   )r   rC   r   r=   r@   r   )NNNN	r"   r#   r$   r%   r&   r3   r   r*   r   r   s   @r,   r4   r4      s    Q
&   r+   r4   c                   2   ^  \ rS rSrSrU 4S jrS rSrU =r$ )r:   i  z
Image to Patch Embedding.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)r2   r3   r`   ra   r   r8   rl   collectionsabcIterabler<   r   Conv2drX   )rE   rD   r`   ra   r   r8   r<   rF   s          r,   r3   ViltPatchEmbeddings.__init__  s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir+   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  R                  R
                  nU R                  UR                  US95      nU$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   )rZ   r   
ValueErrorrX   rY   r   rj   )rE   rv   r   r   r   r   target_dtyper|   s           r,   r   ViltPatchEmbeddings.forward.  sc    2>2D2D/
&,,,w  --33OOLOO,O?@r+   )r`   r   r<   ra   rX   r   r   s   @r,   r:   r:     s    j r+   r:   c                   8   ^  \ rS rSrU 4S jrS rSS jrSrU =r$ )ViltSelfAttentioni9  c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                  U R                  UR                  S9U l        [        R                  " UR                   5      U l        g )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)r2   r3   r8   num_attention_headsr   r   rm   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluerA   attention_probs_dropout_probrC   r   s     r,   r3   ViltSelfAttention.__init__:  s1    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
zz&"E"EFr+   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrQ   r   rI   r   r
   )rJ   r   r   rc   permute)rE   r|   new_x_shapes      r,   transpose_for_scores&ViltSelfAttention.transpose_for_scoresL  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r+   c                    U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n[        R
                  " XR                  SS5      5      n	U	[        R                  " U R                  5      -  n	Ub  X-   n	[        R                  " SS9" U	5      n
U R                  U
5      n
Ub  X-  n
[        R
                  " X5      nUR                  SSSS5      R                  5       nUR                  5       S S U R                   4-   nUR"                  " U6 nU(       a  X4nU$ U4nU$ )NrQ   rP   rK   r   rI   r   r
   )r   r   r   r   r'   matmulrb   mathsqrtr   r   SoftmaxrC   r   
contiguousrJ   r   rc   )rE   r   r   	head_maskoutput_attentionsmixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r,   r   ViltSelfAttention.forwardQ  sS    JJ}5--dhh}.EF	//

=0IJ//0AB !<<5H5HR5PQ+dii8P8P.QQ%/@ **,-=> ,,7  -9O_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=2 O\M]r+   )r   r   rC   r   r   r   r   NNF)	r"   r#   r$   r%   r3   r   r   r*   r   r   s   @r,   r   r   9  s    G$%
! !r+   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )ViltSelfOutputiv  z
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViltAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention = ViltSelfAttention(config)
        self.output = ViltSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViltIntermediate(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViltOutput(nn.Module):
    def __init__(self, config: ViltConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class ViltLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViltAttention(config)
        self.intermediate = ViltIntermediate(config)
        self.output = ViltOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViLT, layernorm is applied before self-attention
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states.to(attention_output.device)

        # in ViLT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class ViltEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViltLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class ViltPreTrainedModel(PreTrainedModel):
    config_class = ViltConfig
    base_model_prefix = "vilt"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViltEmbeddings", "ViltSelfAttention"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring
class ViltModel(ViltPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True):
        r"""
    Whether to add a pooling layer
r   N)r2   r3   rD   r.   r   rG  encoderr   r   r8   r   	layernorm
ViltPoolerpooler	post_init)rE   rD   add_pooling_layerrF   s      r,   r3   ViltModel.__init__@  si    
 	 (0"6*f&8&8f>S>ST,=j(4 	r+   c                 B    U R                   R                  R                  $ r
  r   r5   r   rE   s    r,   get_input_embeddingsViltModel.get_input_embeddingsQ  s    ..>>>r+   c                 8    XR                   R                  l        g r
  r  )rE   r   s     r,   set_input_embeddingsViltModel.set_input_embeddingsT  s    :?''7r+   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrw  rL  r  r   )rE   heads_to_prunerL  r  s       r,   _prune_headsViltModel._prune_headsW  s<    
 +002LELLu%//;;EB 3r+   r   r   r   rv   rw   r   r   r   r   r   rY  rZ  r  c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[	        S5      eUu  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUb  Ub  [	        S5      eUc  Uc  [	        S5      eUb  UR                  S   OUR                  S   nUU:w  a  [	        S	5      eUc@  [        R                  " UU R                   R                  U R                   R                  4US9nU R                  X`R                   R                  5      nU R                  UUUUUUUU	S
9u  nnU R                  X-5      nU R!                  UUUU
UUS9nUS   nU R#                  U5      nU R$                  b  U R%                  U5      OSnU(       d
  UU4USS -   $ ['        UUUR(                  UR*                  S9$ )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        # Make the joint attention mask broadcastable to all attention heads
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViLT Model with a language modeling head on top as done during pretraining.
    """
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings
        self.mlm_score.bias = new_embeddings.bias

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import requests
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable PP
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config, weight=None):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        if weight is not None:
            self.decoder.weight = weight

        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
        self.decoder.bias = self.bias

    def _tie_weights(self):
        self.decoder.bias = self.bias

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    """
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    """
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[ViltForImagesAndTextClassificationOutput, Tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import requests
        >>> from PIL import Image

        >>> image1 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg", stream=True).raw)
        >>> image2 = Image.open(requests.get("https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg", stream=True).raw)
        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image1, image2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image through the model
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 1,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        # classify only the text tokens of the joint sequence
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ViltForImageAndTextRetrieval",
    "ViltForImagesAndTextClassification",
    "ViltForTokenClassification",
    "ViltForMaskedLM",
    "ViltForQuestionAnswering",
    "ViltLayer",
    "ViltModel",
    "ViltPreTrainedModel",
]
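

# Minimal usage sketch for `ViltForTokenClassification`, which (unlike the other
# heads) carries no example in its docstring. This is illustrative only: there is
# no official ViLT token-classification checkpoint, so the backbone name below is
# a stand-in assumption and the randomly initialized head would need fine-tuning
# before its predictions mean anything.
if __name__ == "__main__":
    import requests
    from PIL import Image

    from transformers import ViltProcessor

    processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
    # num_labels=2 is a hypothetical label-space size chosen for the sketch
    model = ViltForTokenClassification.from_pretrained("dandelin/vilt-b32-mlm", num_labels=2)

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(image, "a bunch of cats laying on a couch", return_tensors="pt")

    outputs = model(**inputs)
    # logits cover only the text tokens: (batch_size, text_seq_len, num_labels)
    print(outputs.logits.shape)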