
    fTh|                     <   S r SSKrSSKJrJrJrJrJrJ	r	J
r
  SSKrSSKrSSKJr  SSKJrJrJr  SSKJr  SSKJrJrJrJr  SS	KJrJr  SS
KJrJr  SSKJ r J!r!J"r"  SSK#J$r$  SSK%J&r&  \!RN                  " \(5      r) " S S\RT                  5      r+ " S S\RT                  5      r, S>S\RT                  S\RZ                  S\RZ                  S\RZ                  S\\RZ                     S\.S\.4S jjr/ " S S\RT                  5      r0 " S S\RT                  5      r1 " S S \RT                  5      r2 " S! S"\RT                  5      r3S?S#\RZ                  S$\.S%\4S&\RZ                  4S' jjr5 " S( S)\RT                  5      r6 " S* S+\RT                  5      r7 " S, S-\RT                  5      r8 " S. S/\RT                  5      r9 " S0 S1\RT                  5      r:\  " S2 S3\5      5       r;\  " S4 S5\;5      5       r<\ " S6S79 " S8 S9\;5      5       r=\ " S:S79 " S; S<\;\$5      5       r>/ S=Qr?g)@zPyTorch DINOv2 model.    N)CallableDictListOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int)BackboneMixin   )Dinov2Configc                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
SS\R                  S\\R                     S\R                  4S jjrSrU =r$ )Dinov2Embeddings%   zE
Construct the CLS token, mask token, position and patch embeddings.
configreturnNc                   > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a:  [        R                  " [        R                  " SUR                  5      5      U l
        [        U5      U l        U R                  R                  n[        R                  " [        R
                  " SUS-   UR                  5      5      U l        [        R                  " UR                   5      U l        UR$                  U l        UR                  U l        Xl        g )Nr   )super__init__r
   	Parametertorchrandnhidden_size	cls_tokenuse_mask_tokenzeros
mask_tokenDinov2PatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer!   )selfr!   r0   	__class__s      b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/dinov2/modeling_dinov2.pyr%   Dinov2Embeddings.__init__*   s    ekk!Q8J8J&KL   ll5;;q&:L:L+MNDO 5f =++77#%<<A{QPVPbPb0c#d zz&"<"<= ++$33    
embeddingsheightwidthc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      nUR                  n[        R                  R                  UR                  [        R                  5      X4SS	S
9R                  US9nUR                  SSSS5      R                  SSU5      n[        R                   " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Ng      ?r   r      bicubicF)sizemodealign_cornersdtypedim)shaper1   r'   jit
is_tracingr5   r   reshapepermuterF   r
   
functionalinterpolatetofloat32viewcat)r6   r;   r<   r=   r0   num_positionsclass_pos_embedpatch_pos_embedrH   
new_height	new_widthsqrt_num_positionstarget_dtypes                r8   interpolate_pos_encoding)Dinov2Embeddings.interpolate_pos_encoding8   s~    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=&,,--33u}}-(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/;CCr:   pixel_valuesbool_masked_posc                 >   UR                   u  p4pVU R                  R                  R                  R                  nU R                  UR                  US95      nUbj  U R                  (       aY  [        R                  " UR                  S5      U R                  R                  UR                  5      R                  S5      U5      nU R                  R                  USS5      n	[        R                  " X4SS9nXR                  XU5      -   nU R                  U5      nU$ )NrE   r?   r   r   rG   )rI   r/   
projectionweightrF   rP   r+   r'   where	unsqueezer-   r*   expandrS   r[   r4   )
r6   r]   r^   
batch_size_r<   r=   rZ   r;   
cls_tokenss
             r8   forwardDinov2Embeddings.forward`   s    '3'9'9$
v,,77>>DD**<???+NO
&4+>+>))"-t/A/A*BRBR/S/]/]^_/`blJ
 ^^**:r2>
YY
7Q?
  "?"?
TY"ZZ
\\*-
r:   )r*   r!   r4   r-   r/   r5   r1   r+   N)__name__
__module____qualname____firstlineno____doc__r   r%   r'   Tensorintr[   r   rh   __static_attributes____classcell__r7   s   @r8   r   r   %   s    |  &D5<< &D &DUX &D]b]i]i &DPELL 8ELLCY ejeqeq  r:   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )r.   v   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                   > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nX l        X0l        X@l        X`l
        [        R                  " XEX3S9U l        g )Nr   r   )kernel_sizestride)r$   r%   
image_sizer5   num_channelsr)   
isinstancecollectionsabcIterabler0   r
   Conv2dr`   )r6   r!   rz   r5   r{   r)   r0   r7   s          r8   r%   Dinov2PatchEmbeddings.__init__}   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY$$(&))L:ir:   r]   r"   c                     UR                   S   nX R                  :w  a  [        SU R                   SU S35      eU R                  U5      R	                  S5      R                  SS5      nU$ )Nr   zoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r@   )rI   r{   
ValueErrorr`   flatten	transpose)r6   r]   r{   r;   s       r8   rh   Dinov2PatchEmbeddings.forward   sx    #))!,,,,!../yaI  __\2::1=GG1M
r:   )rz   r{   r0   r5   r`   )rk   rl   rm   rn   ro   r%   r'   rp   rh   rr   rs   rt   s   @r8   r.   r.   v   s.    jELL U\\  r:   r.   modulequerykeyvalueattention_maskscalingr4   c                    [         R                  " XR                  SS5      5      U-  n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9nUb  X-  n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr?   )rH   rF   )ptrainingr   r@   )r'   matmulr   r
   rN   softmaxrQ   rP   rF   r4   r   
contiguous)
r   r   r   r   r   r   r4   kwargsattn_weightsattn_outputs
             r8   eager_attention_forwardr      s     <<}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L !#4,,|3K''1-88:K$$r:   c            
          ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jr SS\	\R                     S	\
S\\\R                  \R                  4   \\R                     4   4S
 jjrSrU =r$ )Dinov2SelfAttention   r!   r"   Nc                 0  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        UR                  U l        U R                  S-  U l        SU l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        [        R                  " UR                  U R                  UR                   S9U l        g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   g      Fbias)r$   r%   r)   num_attention_headshasattrr   r!   rq   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr   	is_causalr
   Linearqkv_biasr   r   r   r6   r!   r7   s     r8   r%   Dinov2SelfAttention.__init__   sG    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r:   xc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      nUR	                  SSSS5      $ )Nr?   r   r@   r   r   )rB   r   r   rR   rM   )r6   r   new_x_shapes      r8   transpose_for_scores(Dinov2SelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r:   	head_maskoutput_attentionsc                    U R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      n[        nU R
                  R                  S:w  aT  U R
                  R                  S:X  a  U(       a  [        R                  S5        O[        U R
                  R                     nU" U UUUUU R                  U R                  U R                  (       d  SOU R                  S9u  pUR                  5       S S U R                  4-   n
UR!                  U
5      nU(       a  X4nU$ U4nU$ )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r   r   r4   r   )r   r   r   r   r   r!   _attn_implementationloggerwarning_oncer   r   r   r   r   rB   r   rL   )r6   hidden_statesr   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss               r8   rh   Dinov2SelfAttention.forward   s9    --dhh}.EF	//

=0IJ//

=0IJ(?;;++w6{{//69>O##L
 '>dkk>^>^&_#)<nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EF6G=2 O\M]r:   )
r   r   r!   r   r   r   r   r   r   r   NF)rk   rl   rm   rn   r   r%   r'   rp   r   r   boolr	   r   rh   rr   rs   rt   s   @r8   r   r      s    ]| ] ](%ell %u|| % bg!(0(>!Z^!	uU\\5<</0%2EE	F! !r:   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S	 jr	S
r
U =r$ )Dinov2SelfOutput   z
The residual connection is defined in Dinov2Layer instead of here (as is the case with other models), due to the
layernorm applied before each block.
r!   r"   Nc                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g rj   )	r$   r%   r
   r   r)   denser2   r3   r4   r   s     r8   r%   Dinov2SelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r:   r   input_tensorc                 J    U R                  U5      nU R                  U5      nU$ rj   r   r4   )r6   r   r   s      r8   rh   Dinov2SelfOutput.forward  s$    

=1]3r:   r   )rk   rl   rm   rn   ro   r   r%   r'   rp   rh   rr   rs   rt   s   @r8   r   r      sI    
>| > >
U\\  RWR^R^  r:   r   c                      ^  \ rS rSrS\SS4U 4S jjrS\\   SS4S jr  SS\	R                  S	\\	R                     S
\S\\\	R                  \	R                  4   \\	R                     4   4S jjrSrU =r$ )Dinov2Attentioni	  r!   r"   Nc                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g rj   )r$   r%   r   	attentionr   outputsetpruned_headsr   s     r8   r%   Dinov2Attention.__init__
  s0    ,V4&v.Er:   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   rG   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)r6   r   indexs      r8   prune_headsDinov2Attention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r:   r   r   r   c                 f    U R                  XU5      nU R                  US   U5      nU4USS  -   nU$ )Nr   r   )r   r   )r6   r   r   r   self_outputsattention_outputr   s          r8   rh   Dinov2Attention.forward"  sC     ~~m@QR;;|AF#%QR(88r:   )r   r   r   r   )rk   rl   rm   rn   r   r%   r   rq   r   r'   rp   r   r   r	   r   rh   rr   rs   rt   s   @r8   r   r   	  s    "| " ";S ;d ;* -1"'	|| ELL)  	
 
uU\\5<</0%2EE	F r:   r   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )Dinov2LayerScalei0  r"   c                    > [         TU ]  5         [        R                  " UR                  [
        R                  " UR                  5      -  5      U l        g rj   )	r$   r%   r
   r&   layerscale_valuer'   onesr)   lambda1r   s     r8   r%   Dinov2LayerScale.__init__1  s8    ||F$;$;ejjI[I[>\$\]r:   hidden_statec                     XR                   -  $ rj   r   r6   r   s     r8   rh   Dinov2LayerScale.forward5  s    ll**r:   r   r"   N
rk   rl   rm   rn   r%   r'   rp   rh   rr   rs   rt   s   @r8   r   r   0  s)    ^+ELL +U\\ + +r:   r   input	drop_probr   r"   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
r   r   r   )r   )rF   device)rI   ndimr'   randrF   r   floor_div)r   r   r   	keep_probrI   random_tensorr   s          r8   	drop_pathr   :  s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr:   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )Dinov2DropPathiO  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r"   c                 .   > [         TU ]  5         Xl        g rj   )r$   r%   r   )r6   r   r7   s     r8   r%   Dinov2DropPath.__init__R  s    "r:   r   c                 B    [        XR                  U R                  5      $ rj   )r   r   r   )r6   r   s     r8   rh   Dinov2DropPath.forwardV  s    FFr:   c                 8    SR                  U R                  5      $ )Nzp={})formatr   r6   s    r8   
extra_reprDinov2DropPath.extra_reprY  s    }}T^^,,r:   )r   rj   )rk   rl   rm   rn   ro   r   floatr%   r'   rp   rh   strr  rr   rs   rt   s   @r8   r   r   O  sQ    b#(5/ #T # #GU\\ Gell G-C - -r:   r   c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )	Dinov2MLPi]  r"   c                 z  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[
        R                  " X$SS9U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [
        R                  " XCSS9U l        g )NTr   )r$   r%   r)   rq   	mlp_ratior
   r   fc1r|   
hidden_actr  r   
activationfc2r6   r!   in_featuresout_featureshidden_featuresr7   s        r8   r%   Dinov2MLP.__init__^  s    %+%7%77f0063C3CCD99[Ef''--$V%6%67DO$//DO99_Fr:   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rj   )r  r  r  r   s     r8   rh   Dinov2MLP.forwardi  s2    xx-|4xx-r:   )r  r  r  r   r   rt   s   @r8   r	  r	  ]  s)    	GELL U\\  r:   r	  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )Dinov2SwiGLUFFNip  r"   c                 $  > [         TU ]  5         UR                  =p#[        UR                  UR                  -  5      n[        US-  S-  5      S-   S-  S-  n[
        R                  " USU-  SS9U l        [
        R                  " XCSS9U l        g )Nr@   r         Tr   )	r$   r%   r)   rq   r  r
   r   
weights_inweights_outr  s        r8   r%   Dinov2SwiGLUFFN.__init__q  s    %+%7%77f0063C3CCD2Q67!;AAE))K_1D4P99_Nr:   r   c                     U R                  U5      nUR                  SSS9u  p#[        R                  R	                  U5      U-  nU R                  U5      $ )Nr@   r?   rG   )r  chunkr
   rN   silur  )r6   r   x1x2hiddens        r8   rh   Dinov2SwiGLUFFN.forwardz  sQ    |4##A2#.##B'",''r:   )r  r  r   r   rt   s   @r8   r  r  p  s)    O(ELL (U\\ ( (r:   r  c                      ^  \ rS rSrSrS\SS4U 4S jjr  SS\R                  S\	\R                     S	\
S\\\R                  \R                  4   \\R                     4   4S
 jjrSrU =r$ )Dinov2Layeri  zCThis corresponds to the Block class in the original implementation.r!   r"   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  S9U l        [        U5      U l        [        U5      U l
        UR                  S:  a  [        UR                  5      O[        R                  " 5       U l        [        R                  " UR                  UR
                  S9U l        UR                   (       a  [#        U5      U l        O['        U5      U l        [        U5      U l        g )Nepsr   )r$   r%   r
   	LayerNormr)   layer_norm_epsnorm1r   r   r   layer_scale1drop_path_rater   Identityr   norm2use_swiglu_ffnr  mlpr	  layer_scale2r   s     r8   r%   Dinov2Layer.__init__  s    \\&"4"4&:O:OP
(0,V4BHBWBWZ]B](=(=>cecncncp\\&"4"4&:O:OP
  &v.DH (DH,V4r:   r   r   r   c                 >   U R                  U R                  U5      UUS9nUS   nU R                  U5      nUSS  nU R                  U5      U-   nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      U-   nU4U-   nU$ )N)r   r   r   )r   r-  r.  r   r1  r3  r4  )r6   r   r   r   self_attention_outputsr   r   layer_outputs           r8   rh   Dinov2Layer.forward  s     "&JJ}%/ "0 "

 2!4,,-=>(, '78=H zz-0xx-((6 ~~l3mC/G+r:   )r   r   r.  r4  r3  r-  r1  r   )rk   rl   rm   rn   ro   r   r%   r'   rp   r   r   r	   r   rh   rr   rs   rt   s   @r8   r'  r'    s    M5| 5 5& -1"'	|| ELL)  	
 
uU\\5<</0%2EE	F r:   r'  c                      ^  \ rS rSrS\SS4U 4S jjr    SS\R                  S\\R                     S\	S	\	S
\	S\
\\4   4S jjrSrU =r$ )Dinov2Encoderi  r!   r"   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
r$   r%   r!   r
   
ModuleListrangenum_hidden_layersr'  layergradient_checkpointingr6   r!   rf   r7   s      r8   r%   Dinov2Encoder.__init__  sR    ]]vG_G_A`#aA`AK$7A`#ab
&+# $bs   A&r   r   r   output_hidden_statesreturn_dictc                    U(       a  SOS nU(       a  SOS n[        U R                  5       Hz  u  pU(       a  Xa4-   nUb  X(   OS n
U R                  (       a0  U R                  (       a  U R	                  U	R
                  UU
U5      nO	U	" XU5      nUS   nU(       d  Mr  X{S   4-   nM|     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )N r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frj   rG  ).0vs     r8   	<genexpr>(Dinov2Encoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)last_hidden_stater   
attentions)	enumerater@  rA  r   _gradient_checkpointing_func__call__tupler   )r6   r   r   r   rD  rE  all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputss               r8   rh   Dinov2Encoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO**t}} $ A A ))!#%	! !-]M^ _)!,M  &91=M<O&O#'  5*   14D Dm]GZ$[mmm++*
 	
r:   )r!   rA  r@  )NFFT)rk   rl   rm   rn   r   r%   r'   rp   r   r   r	   rR  r   rh   rr   rs   rt   s   @r8   r;  r;    s    ,| , , -1"'%* )
||)
 ELL))
  	)

 #)
 )
 
uo%	&)
 )
r:   r;  c                       \ rS rSr\rSrSrSrS/r	Sr
SrS\\R                  \R                  \R                   4   SS4S	 jrS
rg)Dinov2PreTrainedModeli  dinov2r]   Tr  r   r"   Nc                 j   [        U[        R                  [        R                  45      (       a  [        R                  R                  UR                  R                  R                  [        R                  5      SU R                  R                  S9R                  UR                  R                  5      UR                  l        UR                  b%  UR                  R                  R                  5         gg[        U[        R                   5      (       aJ  UR                  R                  R                  5         UR                  R                  R#                  S5        g[        U[$        5      (       Gam  [        R                  R                  UR&                  R                  R                  [        R                  5      SU R                  R                  S9R                  UR&                  R                  5      UR&                  l        [        R                  R                  UR(                  R                  R                  [        R                  5      SU R                  R                  S9R                  UR(                  R                  5      UR(                  l        U R                  R*                  (       a%  UR,                  R                  R                  5         gg[        U[.        5      (       a:  UR0                  R                  R#                  U R                  R2                  5        gg)zInitialize the weightsr   )meanstdNg      ?)r|   r
   r   r   inittrunc_normal_ra   datarP   r'   rQ   r!   initializer_rangerF   r   zero_r+  fill_r   r1   r*   r+   r-   r   r   r   )r6   r   s     r8   _init_weights#Dinov2PreTrainedModel._init_weights  s-   fryy"))455 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( '--KK""$MM$$S) 011.0gg.C.C**//225==AKK11 /D / b++112	 &&+ %'GG$9$9  %%((7KK11 %: % b!!''(	 ! {{))!!&&,,. * 011NN%%dkk&B&BC 2r:   rG  )rk   rl   rm   rn   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2r	   r
   r   r   r+  rf  rr   rG  r:   r8   r[  r[    s[    L $O&*#*+N!DE"))RYY*L$M DRV Dr:   r[  c                   
  ^  \ rS rSrS\4U 4S jjrS\4S jrS\\	\
\	   4   SS4S jr\      SS	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )Dinov2Modeli  r!   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        U R                  5         g )Nr)  )r$   r%   r!   r   r;   r;  encoderr
   r+  r)   r,  	layernorm	post_initr   s     r8   r%   Dinov2Model.__init__  sW     *62$V,f&8&8f>S>ST 	r:   r"   c                 .    U R                   R                  $ rj   r;   r/   r  s    r8   get_input_embeddings Dinov2Model.get_input_embeddings       ///r:   heads_to_pruneNc                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsrr  r@  r   r   )r6   r{  r@  r   s       r8   _prune_headsDinov2Model._prune_heads#  s<    
 +002LELLu%//;;EB 3r:   r]   r^   r   r   rD  rE  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  X0R                   R                  5      nU R                  XS9nU R                  UUUUUS9nUS   n	U R                  U	5      n	U	SS2SSS24   n
U(       d
  X4nXSS -   $ [        U	U
UR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
    pre-training.
Nz You have to specify pixel_values)r^   r   r   rD  rE  r   r   )rM  pooler_outputr   rN  )r!   r   rD  use_return_dictr   get_head_maskr?  r;   rr  rs  r   r   rN  )r6   r]   r^   r   r   rD  rE  embedding_outputencoder_outputssequence_outputpooled_outputhead_outputss               r8   rh   Dinov2Model.forward+  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@ &&y++2O2OP	??<?Y,,/!5# ' 
 *!,..9'1a0+;L!""555)-')77&11	
 	
r:   )r!   r;   rr  rs  NNNNNN)rk   rl   rm   rn   r   r%   r.   rx  r   rq   r   r~  r   r   r'   rp   r   r	   r   r   rh   rr   rs   rt   s   @r8   rp  rp    s    
| 
0&; 0C4T#Y+? CD C  0426,0,0/3&*4
u||,4
 "%,,/4
 ELL)	4

 $D>4
 'tn4
 d^4
 
u00	14
 4
r:   rp  z
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\      SS\\R                     S\\R                     S\\R                     S	\\
   S
\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )Dinov2ForImageClassificationic  r!   r"   Nc                 6  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a.  [
        R                  " UR                  S-  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )Nr   r@   )r$   r%   
num_labelsrp  r\  r
   r   r)   r0  
classifierrt  r   s     r8   r%   %Dinov2ForImageClassification.__init__j  sy      ++!&) EKDUDUXYDYBIIf((1,f.?.?@_a_j_j_l 	
 	r:   r]   r   labelsr   rD  rE  c                    Ub  UOU R                   R                  nU R                  UUUUUS9nUS   nUSS2S4   n	USS2SS24   n
[        R                  " XR                  SS9/SS9nU R                  U5      nSnUGb  UR                  UR                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l	        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l	        OSU R                   l	        U R                   R                  S:X  aI  [        5       nU R                  S:X  a&  U" UR                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [!        5       nU" UR#                  S	U R                  5      UR#                  S	5      5      nO,U R                   R                  S:X  a  [%        5       nU" X5      nU(       d  U4US
S -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr  r   r   rG   
regressionsingle_label_classificationmulti_label_classificationr?   r@   )losslogitsr   rN  )r!   r  r\  r'   rS   r^  r  rP   r   problem_typer  rF   longrq   r   squeezer   rR   r   r   r   rN  )r6   r]   r   r  r   rD  rE  r   r  r*   patch_tokenslinear_inputr  r  loss_fctr   s                   r8   rh   $Dinov2ForImageClassification.forwardx  s     &1%<k$++B]B]++/!5#  
 "!*#AqD)	&q!"u-yy)->->1->-E!FAN.YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE$!//))	
 	
r:   )r  r\  r  r  )rk   rl   rm   rn   r   r%   r   r   r'   rp   r   r	   rR  r   rh   rr   rs   rt   s   @r8   r  r  c  s    |    04,0)-,0/3&*D
u||,D
 ELL)D
 &	D

 $D>D
 'tnD
 d^D
 
u++	,D
 D
r:   r  zO
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                      ^  \ rS rSrU 4S jrS\4S jr\   SS\R                  S\
\   S\
\   S\
\   S\4
S	 jj5       rS
rU =r$ )Dinov2Backbonei  c                 v  > [         TU ]  U5        [         TU ]	  U5        [        UR                  S-   5       Vs/ s H  o!R
                  PM     snU l        [        U5      U l        [        U5      U l
        [        R                  " UR
                  UR                  S9U l        U R                  5         g s  snf )Nr   r)  )r$   r%   _init_backboner>  r?  r)   num_featuresr   r;   r;  rr  r
   r+  r,  rs  rt  rB  s      r8   r%   Dinov2Backbone.__init__  s     v&9>v?W?WZ[?[9\]9\A//9\]*62$V,f&8&8f>S>ST 	 ^s   B6r"   c                 .    U R                   R                  $ rj   rw  r  s    r8   rx  #Dinov2Backbone.get_input_embeddings  rz  r:   r]   rD  r   rE  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  USX4S9nU(       a  UR                  OUS   nSn[        U R                  U5       H  u  pXR                  ;   d  M  U R                   R                  (       a  U R                  U
5      n
U R                   R                  (       aj  U
SS2SS24   n
UR                  u  ppU R                   R                  nU
R                  XU-  X-  S5      n
U
R!                  SSSS	5      R#                  5       n
X4-  nM     U(       d  U(       a  U4USS -   nU$ U4US	S -   nU$ [%        UU(       a  UR                  OSU(       a  UR&                  S
9$ SS
9$ )a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
>>> model = AutoBackbone.from_pretrained(
...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 16, 16]
```NT)rD  r   rE  r   rG  r?   r   r   r@   )feature_mapsr   rN  )r!   r  rD  r   r;   rr  r   zipstage_namesr  apply_layernormrs  reshape_hidden_statesrI   r5   rL   rM   r   r   rN  )r6   r]   rD  r   rE  r  r   r   r  stager   re   rf   r<   r=   r5   r   s                    r8   rh   Dinov2Backbone.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq??<8,,4K\  
 2=--'!*#&t'7'7#GE)));;..#'>>,#?L;;44#/12#6L 4@3E3E0J6!%!7!7J#/#7#7
jDXZ_Zmoq#rL#/#7#71a#C#N#N#PL/ $H #&712;6 M '712;6M%3G'//T->w))
 	
 EI
 	
r:   )r;   rr  rs  r  )NNN)rk   rl   rm   rn   r%   r.   rx  r   r'   rp   r   r   r   rh   rr   rs   rt   s   @r8   r  r    s{    0&; 0  04,0&*G
llG
 'tnG
 $D>	G

 d^G
 
G
 G
r:   r  )r  rp  r[  r  )r   )r   F)@ro   collections.abcr}   typingr   r   r   r   r   r   r	   r'   torch.utils.checkpointr
   torch.nnr   r   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   utils.backbone_utilsr   configuration_dinov2r   
get_loggerrk   r   Moduler   r.   rp   r  r   r   r   r   r   r   r   r   r	  r  r'  r;  r[  rp  r  r  __all__rG  r:   r8   <module>r     s8     D D D    A A ! r r F Q 7 7 1 . 
		H	%Nryy NbBII R %II%<<% 
% <<	%
 U\\*% % %>;")) ;~ryy &$bii $N+ryy +U\\ e T V[VbVb *-RYY -		 &(bii ("0")) 0h0
BII 0
f &DO &D &DR M
' M
 M
` T
#8 T
T
n 
Y
*M Y

Y
x er:   