
"""PyTorch ViT model."""

import collections.abc
import math
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_vit import ViTConfig


logger = logging.get_logger(__name__)


class ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = ViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
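# --- Illustrative usage sketch (not part of the upstream module) ---------------------
# A minimal demonstration of `ViTEmbeddings.interpolate_pos_encoding` above: a model
# configured for 224x224 inputs accepts a 256x256 image once interpolation is enabled.
# The helper name `_demo_interpolated_embeddings` is ours, purely for illustration.
def _demo_interpolated_embeddings():
    config = ViTConfig()  # defaults: 224x224 images, 16x16 patches, hidden_size 768
    embeddings = ViTEmbeddings(config)
    pixel_values = torch.randn(1, 3, 256, 256)  # larger than the pre-training resolution
    # The 14x14 grid of pre-trained position encodings is bicubically resized to 16x16,
    # so the sequence grows to (256 // 16) ** 2 patch tokens plus one [CLS] token.
    out = embeddings(pixel_values, interpolate_pos_encoding=True)
    assert out.shape == (1, (256 // 16) ** 2 + 1, config.hidden_size)  # (1, 257, 768)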
class ViTPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class ViTSelfAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
                    'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
class ViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViTAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.attention = ViTSelfAttention(config)
        self.output = ViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViTIntermediate(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViTOutput(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class ViTLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViTAttention(config)
        self.intermediate = ViTIntermediate(config)
        self.output = ViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class ViTEncoder(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ViTLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class ViTPreTrainedModel(PreTrainedModel):
    config_class = ViTConfig
    base_model_prefix = "vit"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ViTEmbeddings", "ViTLayer"]
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, ViTEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            module.cls_token.data = nn.init.trunc_normal_(
                module.cls_token.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.cls_token.dtype)
            if module.mask_token is not None:
                module.mask_token.data.zero_()


@auto_docstring
class ViTModel(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ViTEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = ViTEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = ViTPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> ViTPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViTPooler(nn.Module):
    def __init__(self, config: ViTConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
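# --- Illustrative usage sketch (not part of the upstream module) ---------------------
# Running the bare `ViTModel` above as a feature extractor, on randomly initialized
# weights. The helper name `_demo_vit_features` is ours, purely for illustration.
def _demo_vit_features():
    config = ViTConfig()  # base-size defaults: 12 layers, 12 heads, hidden_size 768
    model = ViTModel(config).eval()
    pixel_values = torch.randn(2, 3, 224, 224)
    with torch.no_grad():
        outputs = model(pixel_values)
    # 196 patch tokens + 1 [CLS] token per image
    assert outputs.last_hidden_state.shape == (2, 197, config.hidden_size)
    # the pooler output comes from feeding the [CLS] hidden state through ViTPooler above
    assert outputs.pooler_output.shape == (2, config.pooler_output_size)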
@auto_docstring(
    custom_intro="""
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class ViTForMaskedImageModeling(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride):
            raise ValueError(
                "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that "
                "the reconstructed image has the same dimensions as the input. "
                f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}."
            )

        outputs = self.vit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output[:, 1:]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[1:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
@auto_docstring(
    custom_intro="""
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """
)
class ViTForImageClassification(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vit = ViTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
   torch.nnr   r   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   configuration_vitr   
get_loggerri   r   Moduler   r-   ro   floatr   r   r   r   r   r   r   r  r(  r>  rC  rf  r  __all__r  r9   r7   <module>r     s      D D D    A A !  G Q 7 7 ( 
		H	%UBII Up$ $\ %II%<<% 
% <<	%
 U\\*% % %<;ryy ;|BII $$299 $Nbii  		 'ryy 'T0
 0
f $/ $/ $/N [
! [
 [
|		  	l
 2 l
l
^ O
 2 O
O
d gr9   