
    fTh             	       V   S r SSKrSSKrSSKrSSKJr  SSKJrJ	r	J
r
Jr  SSKrSSKrSSKJrJr  SSKJrJrJr  SSKJr  SS	KJrJrJrJrJrJr  SS
KJr  SSKJ r J!r!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*  \%RV                  " \,5      r-/ SQr.Sr/Sr0\ " S S\5      5       r1SPS\R                  S\2S\3S\R                  4S jjr4 " S S\Rj                  5      r6 " S S\Rj                  5      r7 " S S\Rj                  5      r8 " S  S!\Rj                  5      r9 " S" S#\95      r: " S$ S%\Rj                  5      r;\9\:S&.r< " S' S(\Rj                  5      r= " S) S*\Rj                  5      r> " S+ S,\Rj                  5      r? " S- S.\Rj                  5      r@ " S/ S0\Rj                  5      rA " S1 S2\Rj                  5      rB\$ " S3 S4\5      5       rC\$ " S5 S6\C5      5       rD " S7 S8\Rj                  5      rE\$" S9S:9 " S; S<\C5      5       rF\$" S=S:9 " S> S?\C5      5       rG " S@ SA\Rj                  5      rH " SB SC\Rj                  5      rI " SD SE\Rj                  5      rJ " SF SG\Rj                  5      rK " SH SI\Rj                  5      rL\$ " SJ SK\C5      5       rM\$" SLS:9 " SM SN\C\(5      5       rN/ SOQrOg)QzPyTorch BEiT model.    N)	dataclass)ListOptionalTupleUnion)Tensornn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel)#compile_compatible_method_lru_cache find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int)BackboneMixin   )
BeitConfig)r      i   zmicrosoft/beit-base-patch16-224ztabby, tabby catc                       \ rS rSrSrSrg)BeitModelOutputWithPooling8   al  
Class for outputs of [`BeitModel`].

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
 N)__name__
__module____qualname____firstlineno____doc____static_attributes__r#       ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/beit/modeling_beit.pyr!   r!   8   s    r*   r!   input	drop_probtrainingreturnc                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtypedevice)shapendimtorchrandr2   r3   floor_div)r,   r-   r.   	keep_probr4   random_tensoroutputs          r+   	drop_pathr=   R   s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr*   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )BeitDropPathf   zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr-   r/   c                 .   > [         TU ]  5         Xl        g N)super__init__r-   )selfr-   	__class__s     r+   rD   BeitDropPath.__init__i   s    "r*   hidden_statesc                 B    [        XR                  U R                  5      $ rB   )r=   r-   r.   rE   rH   s     r+   forwardBeitDropPath.forwardm   s    FFr*   c                 8    SR                  U R                  5      $ )Nzp={})formatr-   rE   s    r+   
extra_reprBeitDropPath.extra_reprp   s    }}T^^,,r*   )r-   rB   )r$   r%   r&   r'   r(   r   floatrD   r6   r   rK   strrP   r)   __classcell__rF   s   @r+   r?   r?   f   sQ    b#(5/ #T # #GU\\ Gell G-C - -r*   r?   c            	          ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
  SS\R                  S\\R                     S\\   S\R                  4S jjrSrU =r$ )BeitEmbeddingsv   z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

configr/   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )rC   rD   r	   	Parameterr6   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenBeitPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)rE   rY   ri   rF   s      r+   rD   BeitEmbeddings.__init__|   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO 3F ; ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r*   
embeddingsheightwidthc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r4   rk   r6   jit
is_tracingrc   r   reshapepermuter	   
functionalinterpolateviewcat)rE   rp   rq   rr   ri   num_positionsclass_pos_embedpatch_pos_embedr|   
new_height	new_widthsqrt_num_positionss               r+   interpolate_pos_encoding'BeitEmbeddings.interpolate_pos_encoding   sS    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr*   pixel_valuesbool_masked_posr   c                 ,   U R                   b  Ub  [        R                  " S5        UR                  u    pEnU R	                  U5      u  nu  pUR                  5       u  pnUbI  U R                  R                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  R                  U
SS5      n[        R                  " X4SS9nU R                   b  XpR                  XuU5      -   nU R                  U5      nXxU	44$ )Nz`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always interpolated to the input image size. The argument will be removed in transformers v4.51.0.rt   r   r{   )rk   warningswarnr4   rb   rx   r`   expand	unsqueezetype_asr^   r6   r   r   rn   )rE   r   r   r   _rq   rr   rp   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                  r+   rK   BeitEmbeddings.forward   s    ##/4L4XMMn
 +001e262G2G2U/
/\!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
+666r*   )r^   rn   re   r`   rb   rc   rk   NN)r$   r%   r&   r'   r(   r   rD   r6   r   intr   r   
BoolTensorboolrK   r)   rT   rU   s   @r+   rW   rW   v   s    
>z >d >.&D5<< &D &DUX &D]b]i]i &DV 7;37	7ll7 "%"2"237 #+4.	7
 
7 7r*   rW   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )ra      z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                    > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pT[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34nUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l
        Xpl        [        R                  " XEX3S9U l        g )Nr   r   kernel_sizestride)rC   rD   re   rc   num_channelsr]   rd   rf   rg   rh   ri   patch_shaper	   Conv2d
projection)	rE   rY   re   rc   r   r]   ri   r   rF   s	           r+   rD   BeitPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk#-j+//:R:R#S#SZZdYq
#-j+//:R:R#S#SZZdYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir*   r   r/   c                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR                   S   UR                   S   pUR	                  S5      R                  SS5      nXgU44$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.ru   r   r   )r4   r   
ValueErrorr   flatten	transpose)	rE   r   r   r   rq   rr   rp   r   r   s	            r+   rK   BeitPatchEmbeddings.forward   s    2>2D2D/
&,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
+666r*   )re   r   ri   r   rc   r   )r$   r%   r&   r'   r(   rD   r6   r   rK   r)   rT   rU   s   @r+   ra   ra      s.    j"7ELL 7U\\ 7 7r*   ra   c                   &  ^  \ rS rSrSS\S\\   SS4U 4S jjjrS r     SS\	R                  S	\\	R                     S
\S\\	R                     S\S\\\      S\\\	R                     \\	R                  \	R                  4   4   4S jjrSrU =r$ )BeitSelfAttentioni  NrY   window_sizer/   c                 J  > [         TU ]  5         Xl        UR                  UR                  -  S:w  a7  [        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  SS9U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        [%        U5      U l        U R&                  (       a  [)        XS9U l        g g )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )rC   rD   rY   r]   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer	   Linearquerykeyvaluerl   attention_probs_dropout_probrn   r   has_relative_position_biasBeitRelativePositionBiasrelative_position_biasrE   rY   r   rF   s      r+   rD   BeitSelfAttention.__init__  sN    : ::a?PVXhHiHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1C%PYYv1143E3EF
zz&"E"EF*.{*;'***B6*cD' +r*   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nrt   r   ru   r   r   )rx   r   r   r   r   )rE   xnew_x_shapes      r+   transpose_for_scores&BeitSelfAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r*   rH   	head_maskoutput_attentionsr   r   
resolutionc                    U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      n	U R                  U5      n
[        R
                  " XR                  SS5      5      nU[        R                  " U R                  5      -  nU R                  (       aS  Uu  pXR                  R                  -  XR                  R                  -  4nXR                  XUR                  S   S9-   nUb  X-   n[        R                   R#                  USS9nU R%                  U5      nUb  X-  n[        R
                  " X5      nUR'                  SSSS5      R)                  5       nUR+                  5       S S U R,                  4-   nUR.                  " U6 nU(       a  UU4nU$ U4nU$ )	Nrt   r   dim_sizer{   r   ru   r   )r   r   r   r   r6   matmulr   mathsqrtr   r   rY   rc   r   r4   r	   r   softmaxrn   r   
contiguousrx   r   r   )rE   rH   r   r   r   r   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrq   rr   r   attention_probscontext_layernew_context_layer_shapeoutputss                      r+   rK   BeitSelfAttention.forward  s    !JJ}5--dhh}.EF	//

=0IJ//0AB !<<5H5HR5PQ+dii8P8P.QQ **&MF![[%;%;;UkkF\F\=\]K/2M2M@S@STU@V 3N 3  
 "-/H --//0@b/I ,,7  -9O_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=/2 O\M]r*   )
r   r   rY   rn   r   r   r   r   r   r   rB   NFNFN)r$   r%   r&   r'   r   r   tuplerD   r   r6   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r   r     s    dz d dSW d d.% -1"'9=).+/3||3 ELL)3  	3
 !) 63 #'3 U3Z(3 
uU\\"E%,,*D$EE	F3 3r*   r   c                      ^  \ rS rSr     SS\R
                  S\\R
                     S\S\\R
                     S\S\\\	      S\
\\R
                     \\R
                  \R
                  4   4   4U 4S	 jjjrS
rU =r$ )BeitSdpaSelfAttentioniU  rH   r   r   r   r   r   r/   c           
        > U(       d  Ub'  [         R                  S5        [        TU ]  UUUUUUS9$ U R	                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      n	U R                  U5      n
S nU R                  (       aQ  Uu  pXR                  R                  -  XR                  R                  -  4nU R                  XUR                  S   S9nUb
  Uc  UnOX-  nS[        R                  " U R                  5      -  n[         R"                  R$                  R'                  U
UU	UU R(                  (       a  U R                  R*                  OSSUS9nUR-                  SS	SS
5      R/                  5       nUR1                  5       S S U R2                  4-   nUR4                  " U6 nUS 4$ )Na  `BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)rH   r   r   r   r   r   r   r   r1   F)	attn_mask	dropout_p	is_causalscaler   ru   r   r   )loggerwarning_oncerC   rK   r   r   r   r   r   rY   rc   r   r4   r   r   r   r6   r	   r   scaled_dot_product_attentionr.   r   r   r   rx   r   r   )rE   rH   r   r   r   r   r   r   r   r   r   	attn_biasrq   rr   r   scalingr   r   rF   s                     r+   rK   BeitSdpaSelfAttention.forwardV  s    	 5w 7?+#"3'=)A% #   !JJ}5--dhh}.EF	//

=0IJ//0AB	**&MF![[%;%;;UkkF\F\=\]K33@S@STU@V 4 I
 "- 2	3	dii 8 899++HHBF--dkk>>UX I 
 &--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDd""r*   r#   r   )r$   r%   r&   r'   r6   r   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r   r   U  s     -1"'9=).+/:#||:# ELL):#  	:#
 !) 6:# #':# U3Z(:# 
uU\\"E%,,*D$EE	F:# :#r*   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrSS\R                  S\R                  S\R                  4S	 jjr	S
r
U =r$ )BeitSelfOutputi  z
The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
layernorm applied before each block.
rY   r/   Nc                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g rB   )	rC   rD   r	   r   r]   denserl   rm   rn   rE   rY   rF   s     r+   rD   BeitSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r*   rH   input_tensorc                 J    U R                  U5      nU R                  U5      nU$ rB   r   rn   )rE   rH   r   gammas       r+   rK   BeitSelfOutput.forward  $    

=1]3r*   r   rB   )r$   r%   r&   r'   r(   r   rD   r6   r   rK   r)   rT   rU   s   @r+   r   r     sI    
>z >d >
U\\  ^c^j^j  r*   r   )eagersdpac                   &  ^  \ rS rSrSS\S\\   SS4U 4S jjjrS r     SS\	R                  S	\\	R                     S
\S\\	R                     S\S\\\      S\\\	R                     \\	R                  \	R                  4   4   4S jjrSrU =r$ )BeitAttentioni  NrY   r   r/   c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )Nr   )	rC   rD   BEIT_SELF_ATTENTION_CLASSES_attn_implementation	attentionr   r<   setpruned_headsr   s      r+   rD   BeitAttention.__init__  s<    4V5P5PQRXr$V,Er*   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r{   )lenr   r  r   r   r  r   r   r   r   r<   r   r   union)rE   headsindexs      r+   prune_headsBeitAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r*   rH   r   r   r   r   r   c                 h    U R                  XX4XV5      nU R                  US   U5      nU4USS  -   n	U	$ )Nr   r   )r  r<   )
rE   rH   r   r   r   r   r   self_outputsattention_outputr   s
             r+   rK   BeitAttention.forward  sK     ~~&7Qi
  ;;|AF#%QR(88r*   )r  r<   r  rB   r   )r$   r%   r&   r'   r   r   r   rD   r  r6   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r  r    s    "z " "SW " ";* -1"'9=).+/|| ELL)  	
 !) 6 #' U3Z( 
uU\\"E%,,*D$EE	F r*   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	BeitIntermediatei  rY   r/   Nc                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g rB   )rC   rD   r	   r   r]   intermediate_sizer   rd   
hidden_actrS   r   intermediate_act_fnr   s     r+   rD   BeitIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r*   rH   c                 J    U R                  U5      nU R                  U5      nU$ rB   r   r  rJ   s     r+   rK   BeitIntermediate.forward  s&    

=100?r*   r  r$   r%   r&   r'   r   rD   r6   r   rK   r)   rT   rU   s   @r+   r  r    s6    9z 9d 9U\\ ell  r*   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	
BeitOutputi  rY   r/   Nc                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g rB   )
rC   rD   r	   r   r  r]   r   rl   rm   rn   r   s     r+   rD   BeitOutput.__init__  sB    YYv779K9KL
zz&"<"<=r*   rH   c                 J    U R                  U5      nU R                  U5      nU$ rB   r   rJ   s     r+   rK   BeitOutput.forward  r   r*   r   r  rU   s   @r+   r   r     s6    >z >d >
U\\ ell  r*   r   c                   (  ^  \ rS rSrSrSS\S\\   S\SS4U 4S jjjr	     SS	\
R                  S
\\
R                     S\S\\
R                     S\S\\\      S\\\
R                     \\
R                  \
R                  4   4   4S jjrSrU =r$ )	BeitLayeri  z?This corresponds to the Block class in the timm implementation.NrY   r   drop_path_rater/   c                   > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        [        U5      U l        [        U5      U l	        [        R                  " UR                  UR                  S9U l        US:  a  [        U5      O[        R                   " 5       U l        [        R                  " UR                  UR                  S9U l        UR&                  nUS:  aw  [        R(                  " U[*        R,                  " UR                  5      -  SS9U l        [        R(                  " U[*        R,                  " UR                  5      -  SS9U l        g Su  U l        U l        g )	Nr   r   epsr1   r   T)requires_gradr   )rC   rD   chunk_size_feed_forwardseq_len_dimr  r  r  intermediater   r<   r	   	LayerNormr]   layer_norm_epslayernorm_beforer?   Identityr=   layernorm_afterlayer_scale_init_valuer[   r6   oneslambda_1lambda_2)rE   rY   r   r'  init_valuesrF   s        r+   rD   BeitLayer.__init__  s   '-'E'E$&vG,V4 ( "V-?-?VEZEZ [9G#9Mn5SUS^S^S`!||F,>,>FDYDYZ33?LLuzz6CUCU7W)WgklDMLLuzz6CUCU7W)WgklDM+5(DM4=r*   rH   r   r   r   r   r   c           	         U R                  U R                  U5      UUUUUS9nUS   nUSS  n	U R                  b  U R                  U-  nU R                  U5      U-   nU R	                  U5      n
U R                  U
5      n
U R                  U
5      n
U R                  b  U R                  U
-  n
U R                  U
5      U-   n
U
4U	-   n	U	$ )N)r   r   r   r   r   r   )r  r1  r6  r=   r3  r.  r<   r7  )rE   rH   r   r   r   r   r   self_attention_outputsr  r   layer_outputs              r+   rK   BeitLayer.forward	  s     "&!!-0/#9%=! "0 "
 2!4(, ==$#}}/?? '78=H ++M:((6{{<0==$==<7L ~~l3mC/G+r*   )
r  r,  r=   r.  r6  r7  r3  r1  r<   r-  )Nr1   r   )r$   r%   r&   r'   r(   r   r   r   rR   rD   r6   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r&  r&    s    I6z 6 6`e 6pt 6 6* -1"'9=).+/)||) ELL))  	)
 !) 6) #') U3Z() 
uU\\"E%,,*D$EE	F) )r*   r&  c                      ^  \ rS rSrS\S\SS4U 4S jjr\" SS9S\\	\	4   S\
R                  4S	 j5       rSS
\S\
R                  4S jjrSrU =r$ )r   i5  rY   r   r/   Nc                    > [         TU ]  5         X l        SUS   -  S-
  SUS   -  S-
  -  S-   U l        [        R
                  " [        R                  " U R                  UR                  5      5      U l	        g )Nru   r   r   r   )
rC   rD   r   num_relative_distancer	   r[   r6   r\   r   relative_position_bias_tabler   s      r+   rD   !BeitRelativePositionBias.__init__6  sp    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LLKK22F4N4NO-
)r*   
   )maxsizec                    SUS   -  S-
  SUS   -  S-
  -  S-   nUS   US   -  n[         R                  " [         R                  " US   5      [         R                  " US   5      SS9n[         R                  " U5      n[         R                  " US5      nUSS2SS2S4   USS2SSS24   -
  nUR                  SSS5      R                  5       nUSS2SS2S4==   US   S-
  -  ss'   USS2SS2S4==   US   S-
  -  ss'   USS2SS2S4==   SUS   -  S-
  -  ss'   [         R                  " US-   4S-  UR                  S9nUR                  S	5      USS2SS24'   US-
  USSS24'   US-
  USS2S4'   US-
  US
'   U$ )z
This method creates the relative position index, modified to support arbitrary window sizes,
as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460).
ru   r   r   r   ij)indexingN)rx   r2   rt   )r   r   )
r6   meshgridarangestackr   r   r   r\   r2   sum)	rE   r   r@  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r+    generate_relative_position_index9BeitRelativePositionBias.generate_relative_position_index?  s    "#[^!3a!7AA<NQR<R SVW W "!n{1~5~~ell;q>:ELLUV<XcghT"vq1(At4~aqj7QQ)11!Q:EEG1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r*   r   c                    SU R                   S   -  S-
  nSU R                   S   -  S-
  nSUS   -  S-
  nSUS   -  S-
  nU R                  nU R                  n	Xg-  S-   n
USU	S-
   nUR                  SXTS5      R	                  SSSS5      n[
        R                  R                  U[        U5      [        U5      4SS9nUR	                  SSSS5      R                  U
S-
  S5      n[        R                  " XU	S-
  S /5      nU R                  U5      nXR                  S5         nUR                  US   US   -  S-   US   US   -  S-   S5      nUR	                  SSS5      R                  5       nU(       a?  [
        R                  R                  UR                  S5      X34SS	S
9R                  S5      nUR                  S5      $ )ze
Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
ru   r   r   r   Nrt   bilinear)rx   ry   Frw   )r   rA  r@  r   r   r	   r   r   r   r6   r   rR  r   r   r   squeeze)rE   r   r   r   
old_height	old_widthr   r    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tablerQ  r   s                   r+   rK    BeitRelativePositionBias.forwardX  s!    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099=VYZ=Z=\]^,
( #'"G"G"T!AB^B^_aBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r*   )r@  rA  r   )FN)r$   r%   r&   r'   r   r   rD   r   r   r   r6   r   rR  r   rK   r)   rT   rU   s   @r+   r   r   5  sr    
z 
 
$ 
 )4'E#s(O 'PUP\P\ ' 5'0-3T -3]b]i]i -3 -3r*   r   c                      ^  \ rS rSrSS\S\\   SS4U 4S jjjr      SS\R                  S\\R                     S	\
S
\
S\
S\\\\4      S\
S\\\4   4S jjrSrU =r$ )BeitEncoderi  NrY   r   r/   c                   > [         TU ]  5         Xl        UR                  U l        U R                  (       a  [        XS9U l        [        R                  " SUR                  UR                  SS9 Vs/ s H  o3R                  5       PM     nn[        R                  " [        UR                  5       Vs/ s H#  n[        UUR                   (       a  UOS XE   S9PM%     sn5      U l        SU l        g s  snf s  snf )Nr   r   cpu)r3   )r   r'  F)rC   rD   rY   !use_shared_relative_position_biasr   r   r   r6   linspacer'  num_hidden_layersitemr	   
ModuleListranger&  use_relative_position_biaslayergradient_checkpointing)rE   rY   r   r   dprirF   s         r+   rD   BeitEncoder.__init__  s    *0*R*R'***B6*cD' "'63H3H&JbJbkp!qr!qAvvx!qr]] v778 9A /5/P/PVZ#&6
 9	

 ',# ss   3C35*C8rH   r   r   output_hidden_statesr   r   return_dictc           
         U(       a  SOS nU(       a  SOS n	[        U R                  5       H  u  pU(       a  X4-   nU R                  (       aR  Uu  pXR                  R                  -  XR                  R                  -  4nU R                  XUR                  S   S9nOS nUb  X*   OS nU R                  (       a3  U R                  (       a"  U R                  UR                  UUUUUU5      nOU" UUUUUU5      nUS   nU(       d  M  U	US   4-   n	M     U(       a  X4-   nU(       d  [        S XU	4 5       5      $ [        UUU	S9$ )Nr#   r   )r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7frB   r#   ).0vs     r+   	<genexpr>&BeitEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	)last_hidden_staterH   
attentions)	enumeraterk  r   rY   rc   r   r4   rl  r.   _gradient_checkpointing_func__call__r   r   )rE   rH   r   r   rp  r   r   rq  all_hidden_statesall_self_attentionsrn  layer_modulerq   rr   r   r   layer_head_masklayer_outputss                     r+   rK   BeitEncoder.forward  so    #7BD$5b4(4OA#$58H$H!.. *%)?)??++J`J`A`a)-)D)D]j]p]pqr]s *E *& *.&.7.CilO**t}} $ A A ))!#%*,! !-!#%*,! *!,M  &9]1=M<O&O#M  5P   14D Dm]GZ$[mmm++*
 	
r*   )rY   rl  r   rk  r   rB   )NFFFNT)r$   r%   r&   r'   r   r   r   rD   r6   r   r   r   r   r   r   rK   r)   rT   rU   s   @r+   ra  ra    s    ,z , ,SW , ,0 -1"'%*).04 >
||>
 ELL)>
  	>

 #>
 #'>
 U38_->
 >
 
uo%	&>
 >
r*   ra  c                   :    \ rS rSr\rSrSrSrS/r	S/r
SrS rSrg	)
BeitPreTrainedModeli  beitr   Tr&  z.*relative_position_index.*c                 0   [        U[        R                  [        R                  [        R                  45      (       ak  UR
                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ax  UR
                  R                  R                  SU R                  R                  S9  UR                  b2  UR
                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        g[        U[         5      (       a  UR"                  R                  R                  5         UR$                  b$  UR$                  R                  R                  5         UR&                  b%  UR&                  R                  R                  5         gg[        U[(        5      (       a%  UR*                  R                  R                  5         g[        U[,        5      (       a  UR.                  bs  UR.                  R                  R                  U R                  R0                  5        UR2                  R                  R                  U R                  R0                  5        ggg)zInitialize the weightsr1   )meanstdNg      ?)rd   r	   r   r   ConvTranspose2dweightdatanormal_rY   initializer_ranger   zero_	Embeddingpadding_idxr/  fill_rW   r^   r`   rk   r   rA  r&  r6  r4  r7  )rE   modules     r+   _init_weights!BeitPreTrainedModel._init_weights  s   fryy"))R5G5GHII MM&&CT[[5R5R&S{{&  &&( '--MM&&CT[[5R5R&S!!-""6#5#56<<> .--KK""$MM$$S)//!!'')  ,!!&&,,.))5**//557 6 899//44::<	***$$**4;;+M+MN$$**4;;+M+MN + +r*   r#   N)r$   r%   r&   r'   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdpar  r)   r#   r*   r+   r  r    s4    L$O&*#$*H)I&NOr*   r  c                      ^  \ rS rSrSS\S\SS4U 4S jjjrS rS r\	      SS	\
R                  S
\\
R                     S\\
R                     S\\   S\\   S\S\\   S\\\4   4S jj5       rSrU =r$ )	BeitModeli  rY   add_pooling_layerr/   Nc                   > [         TU ]  U5        Xl        [        U5      U l        [        XR                  R                  R                  S9U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U(       a  [!        U5      OSU l        U R%                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
r   r)  N)rC   rD   rY   rW   rp   ra  rb   r   encoderuse_mean_poolingr	   r2  r/  r]   r0  	layernorm
BeitPoolerpooler	post_init)rE   rY   r  rF   s      r+   rD   BeitModel.__init__  s    
 	 (0"67W7W7c7cd $44BKKM",,vGYGY_e_t_t:u 	 ->j(4 	r*   c                 .    U R                   R                  $ rB   rp   rb   rO   s    r+   get_input_embeddingsBeitModel.get_input_embeddings      ///r*   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  rk  r  r  )rE   heads_to_prunerk  r  s       r+   _prune_headsBeitModel._prune_heads  s<    
 +002LELLu%//;;EB 3r*   r   r   r   r   rp  r   rq  c           
      2   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  X0R                   R
                  5      nU R                  XS9u  pUR                  SS n
U R                  UUUUU
UUS9nUS   nU R                  U5      nU R                  b  U R                  U5      OSnU(       d  Ub  X4OU4nXSS -   $ [        UUUR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
N)r   ru   )r   r   rp  r   rq  r   r   r   )rx  pooler_outputrH   ry  )rY   r   rp  use_return_dictget_head_maskrf  rp   r4   r  r  r  r!   rH   ry  )rE   r   r   r   r   rp  r   rq  embedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r+   rK   BeitModel.forward&  s;    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] &&y++2O2OP	"oolo\!''+
,,/!5!#%= ' 
 *!,..98<8OO4UY?L?XO;_n^pL!""555)-')77&11	
 	
r*   )rY   rp   r  r  r  )T)NNNNFN)r$   r%   r&   r'   r   r   rD   r  r  r   r6   r   r   r   r   r   r!   rK   r)   rT   rU   s   @r+   r  r    s    z d d  &0C  7;,0,0/3).&*4
ll4
 "%"2"234
 ELL)	4

 $D>4
 'tn4
 #'4
 d^4
 
u00	14
 4
r*   r  c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	r  i^  rY   r/   Nc                    > [         TU ]  5         UR                  (       a/  [        R                  " UR
                  UR                  S9U l        g S U l        g )Nr)  )rC   rD   r  r	   r/  r]   r0  r  r   s     r+   rD   BeitPooler.__init___  sA    KQKbKbBLL++1F1FG 	hl 	r*   rH   c                     U R                   b0  US S 2SS 2S S 24   nU R                  UR                  S5      5      nU$ US S 2S4   nU$ )Nr   r   )r  r  )rE   rH   patch_tokensr  s       r+   rK   BeitPooler.forwarde  sU    >>%(AB2L NN<+<+<Q+?@M
  *!Q$/Mr*   )r  r  rU   s   @r+   r  r  ^  s6    
z 
d 
	U\\ 	ell 	 	r*   r  a  
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    )custom_introc                     ^  \ rS rSrS\SS4U 4S jjr\        SS\\R                     S\\R                     S\\R                     S	\\R                     S
\\   S\\   S\S\\   S\\\4   4S jj5       rSrU =r$ )BeitForMaskedImageModelingiq  rY   r/   Nc                 @  > [         TU ]  U5        UR                  U l        [        USS9U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  UR                  5      U l        U R                  5         g )NFr  r)  )rC   rD   
num_labelsr  r  r	   r/  r]   r0  r  r   
vocab_sizelm_headr  r   s     r+   rD   #BeitForMaskedImageModeling.__init__z  su      ++f>	 f&8&8f>S>STyy!3!3V5F5FG 	r*   r   r   r   labelsr   rp  r   rq  c	           
      l   Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
SS2SS24   5      nSnUb  [        5       nU" X   U5      nU(       d  U4U	SS -   nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )a  
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
>>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, logits = outputs.loss, outputs.logits
>>> list(logits.shape)
[1, 196, 8192]
```N)r   r   r   rp  r   rq  r   r   losslogitsrH   ry  )	rY   r  r  r  r  r   r   rH   ry  )rE   r   r   r   r  r   rp  r   rq  r   r  prediction_scoresmasked_lm_lossloss_fctr<   s                  r+   rK   "BeitForMaskedImageModeling.forward  s    X &1%<k$++B]B]))+/!5%=#  
 "!*..9 LLAB)?@')H%&7&H&QN')GABK7F3A3M^%.YSYY$!//))	
 	
r*   )r  r  r  r  )NNNNNNFN)r$   r%   r&   r'   r   rD   r   r   r6   r   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r  r  q  s    z d   046:,0)-,0/3).&*I
u||,I
 "%"2"23I
 ELL)	I

 &I
 $D>I
 'tnI
 #'I
 d^I
 
un$	%I
 I
r*   r  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                      ^  \ rS rSrS\SS4U 4S jjr\       SS\\R                     S\\R                     S\\R                     S	\\
   S
\\
   S\
S\\
   S\\\4   4S jj5       rSrU =r$ )BeitForImageClassificationi  rY   r/   Nc                 .  > [         TU ]  U5        UR                  U l        [        USS9U l        UR                  S:  a+  [
        R                  " UR                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NTr  r   )rC   rD   r  r  r  r	   r   r]   r2  
classifierr  r   s     r+   rD   #BeitForImageClassification.__init__  ss      ++f=	 OUN_N_bcNc"))F$6$68I8IJikititiv 	r*   r   r   r  r   rp  r   rq  c           	      P   Ub  UOU R                   R                  nU R                  UUUUUUS9nU(       a  UR                  OUS   n	U R	                  U	5      n
SnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                   R
                  S:X  a=  [        5       nU" U
R                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       nU" X5      nU(       d  U
4USS -   nUb  U4U-   $ U$ [!        UU
UR"                  UR$                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nr   r   rp  r   rq  r   
regressionsingle_label_classificationmulti_label_classificationrt   ru   r  )rY   r  r  r  r  problem_typer  r2   r6   longr   r   rV  r   r   r
   r   rH   ry  )rE   r   r   r  r   rp  r   rq  r   r  r  r  r  r<   s                 r+   rK   "BeitForImageClassification.forward  s   " &1%<k$++B]B]))/!5%=#  
 2=--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE$!//))	
 	
r*   )r  r  r  NNNNNFN)r$   r%   r&   r'   r   rD   r   r   r6   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r  r    s    
z 
d 
  04,0)-,0/3).&*=
u||,=
 ELL)=
 &	=

 $D>=
 'tn=
 #'=
 d^=
 
u++	,=
 =
r*   r  c                      ^  \ rS rSrSr   SS\S\S\\\\\4   4   S\\\\\4   \4   S\	S\\\\\4   4   S	S
4U 4S jjjr
S\R                  S	\R                  4S jrSrU =r$ )BeitConvModulei(  a4  
A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
in_channelsout_channelsr   paddingr   dilationr/   Nc           	         > [         TU ]  5         [        R                  " UUUUUUS9U l        [        R
                  " U5      U l        [        R                  " 5       U l        g )N)r  r  r   r  r   r  )	rC   rD   r	   r   convBatchNorm2dbnReLU
activation)rE   r  r  r   r  r   r  rF   s          r+   rD   BeitConvModule.__init__0  sQ     	II#%#
	 ...'')r*   r,   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rB   )r  r  r  )rE   r,   r<   s      r+   rK   BeitConvModule.forwardE  s0    5!(r*   )r  r  r  )r   Fr   )r$   r%   r&   r'   r(   r   r   r   rS   r   rD   r6   r   rK   r)   rT   rU   s   @r+   r  r  (  s     5601$$ $ 3c3h/0	$
 sE#s(OS01$ $ U38_,-$ 
$ $*U\\ ell  r*   r  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jrS
r	U =r
$ )BeitPyramidPoolingBlockiM  
pool_scaler  channelsr/   Nc                    > [         TU ]  5         [        R                  " U5      [	        X#SS9/U l        [        U R
                  5       H   u  pEU R                  [        U5      U5        M"     g )Nr   r   )	rC   rD   r	   AdaptiveAvgPool2dr  layersrz  
add_modulerS   )rE   r  r  r  rn  rk  rF   s         r+   rD    BeitPyramidPoolingBlock.__init__N  sX      ,;a@
 "$++.HAOOCFE* /r*   r,   c                 @    UnU R                    H  nU" U5      nM     U$ rB   r  )rE   r,   hidden_staterk  s       r+   rK   BeitPyramidPoolingBlock.forwardW  s%    [[E .L !r*   r  )r$   r%   r&   r'   r   rD   r6   r   rK   r)   rT   rU   s   @r+   r  r  M  sD    +3 +S +C +D +U\\ ell  r*   r  c            
          ^  \ rS rSrSrS\\S4   S\S\S\SS	4
U 4S
 jjrS\	R                  S\\	R                     4S jrSrU =r$ )BeitPyramidPoolingModulei^  a  
Pyramid Pooling Module (PPM) used in PSPNet.

Args:
    pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
        Module.
    in_channels (int): Input channels.
    channels (int): Channels after modules, before conv_seg.
    align_corners (bool): align_corners argument of F.interpolate.

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
pool_scales.r  r  rz   r/   Nc                   > [         TU ]  5         Xl        X@l        X l        X0l        / U l        [        U5       HE  u  pV[        XbUS9nU R                  R                  U5        U R                  [        U5      U5        MG     g )N)r  r  r  )rC   rD   r   rz   r  r  blocksrz  r  appendr  rS   )	rE   r   r  r  rz   rn  r  blockrF   s	           r+   rD   !BeitPyramidPoolingModule.__init__l  sn    &*& &{3MA+zemnEKKu%OOCFE* 4r*   r   c                     / nU R                    HV  nU" U5      n[        R                  R                  XAR	                  5       SS  SU R
                  S9nUR                  U5        MX     U$ )Nru   rU  rw   )r  r	   r   r   rx   rz   r  )rE   r   ppm_outsppmppm_outupsampled_ppm_outs         r+   rK    BeitPyramidPoolingModule.forwardx  sg    ;;C!fG " 9 9ffhqrl4K]K] !: ! OO-.  r*   )rz   r  r  r  r   )r$   r%   r&   r'   r(   r   r   r   rD   r6   r   r   rK   r)   rT   rU   s   @r+   r  r  ^  s`    
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ $u||*<  r*   r  c                   x   ^  \ rS rSrSrS\SS4U 4S jjrS rS\R                  S\R                  4S	 jr
S
rU =r$ )BeitUperHeadi  z
Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
[UPerNet](https://arxiv.org/abs/1807.10221).

Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
rY   r/   Nc                   > [         TU ]  5         UR                  U l        UR                  /S-  U l        UR                  U l        SU l        [        R                  " U R
                  UR                  SS9U l
        [        U R                  U R                  S   U R
                  U R                  S9U l        [        U R                  S   [        U R                  5      U R
                  -  -   U R
                  SSS9U l        [        R                   " 5       U l        [        R                   " 5       U l        U R                  S S  Hm  n[        X R
                  SS9n[        U R
                  U R
                  SSS9nU R"                  R'                  U5        U R$                  R'                  U5        Mo     [        [        U R                  5      U R
                  -  U R
                  SSS9U l        g )	N   Fr   r  rt   )rz   r   r   r  )rC   rD   r   r]   r  r  rz   r	   r   r  r  r  psp_modulesr  r
  
bottleneckrh  lateral_convs	fpn_convsr  fpn_bottleneck)rE   rY   r  l_convfpn_convrF   s        r+   rD   BeitUperHead.__init__  s   !--"../!3**"))DMM63D3DRST 4R MM,,	
 )R 3t'7'7#84==#HHMM	
  ]]_++CR0K#KANF%dmmT]]PQ[\]H%%f-NN!!(+	 1 -  !DMM1MM	
r*   c                     US   nU/nUR                  U R                  U5      5        [        R                  " USS9nU R	                  U5      nU$ )Nrt   r   r{   )extendr  r6   r   r  )rE   inputsr   psp_outsr<   s        r+   psp_forwardBeitUperHead.psp_forward  sL    2J3((+,99X1-*r*   encoder_hidden_statesc           	      @   [        U R                  5       VVs/ s H  u  p#U" X   5      PM     nnnUR                  U R                  U5      5        [	        U5      n[        US-
  SS5       HP  nXBS-
     R                  SS  nXBS-
     [        R                  R                  XB   USU R                  S9-   XBS-
  '   MR     [        US-
  5       Vs/ s H  o R                  U   " XB   5      PM     nnUR                  US   5        [        US-
  SS5       HA  n[        R                  R                  Xr   US   R                  SS  SU R                  S9Xr'   MC     [        R                  " USS9nU R                  U5      nU R                  U5      nU$ s  snnf s  snf )Nr   r   rt   ru   rU  rw   r{   )rz  r  r  r  r
  ri  r4   r	   r   r   rz   r  r6   r   r  r  )	rE   r  rn  lateral_convlateralsused_backbone_levels
prev_shapefpn_outsr<   s	            r+   rK   BeitUperHead.forward  s   R[\`\n\nRopRoqL!6!9:Rop(()>?@  #8}+a/B7A!a%..qr2J&1uo0I0I*:TM_M_ 1J 1 HUO 8 =BBVYZBZ<[\<[qNN1%hk2<[\%+a/B7A--33(1+"3"3AB"7jX\XjXj 4 HK 8 99X1-$$X.(3 q ]s   F F)
rz   r  r  r  r  r  r  r  r   r  )r$   r%   r&   r'   r(   r   rD   r  r6   r   rK   r)   rT   rU   s   @r+   r  r    sA    $
z $
d $
LU\\ ell  r*   r  c                      ^  \ rS rSrSr SS\S\S\S\\\\\4   4   SS4
U 4S	 jjjr	S
\
R                  S\
R                  4S jrSrU =r$ )BeitFCNHeadi  a  
Fully Convolution Networks for Semantic Segmentation. This head is implemented of
[FCNNet](https://arxiv.org/abs/1411.4038>).

Args:
    config (BeitConfig): Configuration.
    in_channels
    kernel_size (int): The kernel size for convs in the head. Default: 3.
    dilation (int): The dilation rate for convs in the head. Default: 1.


Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
rY   in_indexr   r  r/   Nc                 2  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l	        X l
        US-  U-  n/ nUR                  [        U R                  U R
                  X5US95        [        U R                  S-
  5       H2  nUR                  [        U R
                  U R
                  X5US95        M4     U R                  S:X  a  [        R                  " 5       U l        O[        R"                  " U6 U l        U R                  (       a4  [        U R                  U R
                  -   U R
                  X3S-  S9U l        [        R&                  " U R
                  UR(                  SS9U l        g )Nru   )r   r  r  r   r   r  r  )rC   rD   r]   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr)  r  r  ri  r	   r2  convs
Sequentialconv_catr   r  r  )	rE   rY   r)  r   r  conv_paddingr0  rn  rF   s	           r+   rD   BeitFCNHead.__init__  sH    	!--1133"99 #q(H4  $--[iq	

 t~~)*ALLMM4==kjr + >>QDJ.DJ*  4==0$--[qrbrDM ))DMM63D3DRSTr*   r  c                     XR                      nU R                  U5      nU R                  (       a%  U R                  [        R
                  " X#/SS95      nU R                  U5      nU$ )Nr   r{   )r)  r0  r/  r2  r6   r   r  )rE   r  rH   r<   s       r+   rK   BeitFCNHead.forward	  sT    -mm<M*]]599m-D!#LMF(r*   )r  r  r/  r2  r0  r  r)  r-  )ru   r   r   )r$   r%   r&   r'   r(   r   r   r   r   rD   r6   r   rK   r)   rT   rU   s   @r+   r(  r(    s     tu U  U,/ UBE UUZ[^`efiknfn`o[oUp U	 U  UDU\\ ell  r*   r(  c                      ^  \ rS rSrS\SS4U 4S jjrS r\       SS\\	R                     S\\	R                     S	\\	R                     S
\\   S\\   S\S\\   S\\\4   4S jj5       rSrU =r$ )BeitForSemanticSegmentationi  rY   r/   Nc                 x  > [         TU ]  U5        UR                  U l        [        USS9U l        [        U R                  R                  5      S:w  a  [        S5      e[        R                  " [        R                  " UR                  UR                  SSS9[        R                  " UR                  5      [        R                  " 5       [        R                  " UR                  UR                  SSS95      U l        [        R                  " [        R                  " UR                  UR                  SSS95      U l        [        R"                  " 5       U l        [        R&                  " SSS9U l        [+        U5      U l        UR.                  (       a  [1        U5      OS U l        U R5                  5         g )NFr  r  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ru   r   )rC   rD   r  r  r  r
  rY   out_indicesr   r	   r1  r  r]   r  GELUfpn1fpn2r2  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr(  auxiliary_headr  r   s     r+   rD   $BeitForSemanticSegmentation.__init__  sO     ++f>	 t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 (/5;5N5Nk&1TX 	r*   c                 X   [         R                  R                  XR                  SS  SSS9nUb,  [         R                  R                  X#R                  SS  SSS9n[	        U R
                  R                  S9nU" XC5      nUnUb$  U" WU5      n	XR
                  R                  U	-  -  nU$ )Nr   rU  Frw   )ignore_index)r	   r   r   r4   r   rY   semantic_loss_ignore_indexauxiliary_loss_weight)
rE   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r+   compute_loss(BeitForSemanticSegmentation.compute_loss5  s    ==44bc*5 5 
 ')+)B)B ||BC'8zY^ *C *& $1W1WX-6	'%&@&INKK55FFDr*   r   r   r  r   rp  r   rq  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  UUUSUUS9nU(       a  UR                  OUS   n	[        U	5       V
Vs/ s H&  u  pU
S-   U R                   R                  ;   d  M$  UPM(     nn
nUR                  S   nU R                   R                  U R                   R                  -  nU Vs/ s H2  oSS2SS2SS24   R                  SSS5      R                  USX5      PM4     nnU R                  U R                  U R                   U R"                  /n[%        ['        U5      5       H  nUU   " UU   5      UU'   M     U R)                  U5      nSnU R*                  b  U R+                  U5      nSnUb  U R-                  UUU5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [/        UUU(       a  UR                  OSUR0                  S	9$ s  snn
f s  snf )
a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
>>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

>>> inputs = image_processor(images=image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTr  r   ru   rt   r  )rY   r  rp  r  r   r  rH   rz  r:  r4   re   rc   r   r   r<  r=  r>  r@  ri  r
  rA  rC  rN  r   ry  )rE   r   r   r  r   rp  r   rq  r   r  idxfeaturefeaturesr   patch_resolutionr   opsrn  r  rI  r  r<   s                         r+   rK   #BeitForSemanticSegmentation.forwardH  s[   D &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO))/!%%=#  
 :E 5 5'RS* 1::O0Pw0PTWZ[T[_c_j_j_v_vTvG0Pw!''*
;;11T[[5K5KKnv
nvijaQhK1a(00RAQdnv 	 

 yy$))TYY		:s8}%Aa&!-HQK & !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H< H<	9I)rC  r  rA  r<  r=  r>  r@  r  r  )r$   r%   r&   r'   r   rD   rN  r   r   r6   r   r   r   r   r   rK   r)   rT   rU   s   @r+   r8  r8    s    z d @&  04,0)-,0/3).&*X
u||,X
 ELL)X
 &	X

 $D>X
 'tnX
 #'X
 d^X
 
u--	.X
 X
r*   r8  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                   r   ^  \ rS rSrU 4S jrS r\   SS\S\\	   S\\	   S\\	   S\
4
S	 jj5       rS
rU =r$ )BeitBackbonei  c                   > [         TU ]  U5        [         TU ]	  U5        [        UR                  S-   5       Vs/ s H  o!R
                  PM     snU l        [        U5      U l        [        XR                  R                  R                  S9U l        UR                  (       Ga  [        U R                  R                   5      S:w  a  [#        S5      eUR
                  n[$        R&                  " [$        R(                  " X3SSS9[$        R*                  " X1R,                  S9[$        R.                  " 5       [$        R(                  " X3SSS95      U l        [$        R&                  " [$        R(                  " X3SSS95      U l        [$        R4                  " 5       U l        [$        R8                  " SSS9U l        U R=                  5         g s  snf )Nr   r   r  zBeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.ru   r   r)  )rC   rD   _init_backboneri  rf  r]   num_featuresrW   rp   ra  rb   r   r  add_fpnr
  rY   r:  r   r	   r1  r  r  batch_norm_epsr;  r<  r=  r2  r>  r?  r@  r  )rE   rY   r   r]   rF   s       r+   rD   BeitBackbone.__init__  sW    v&9>v?W?WZ[?[9\]9\A//9\](0"67W7W7c7cd>>>4;;**+q0 1 
 !,,K"";STU{0E0EF	"";STU	DI b&8&8_`ij&klDIDI1=DI 	1 ^s   Gc                 .    U R                   R                  $ rB   r  rO   s    r+   r  !BeitBackbone.get_input_embeddings  r  r*   r   rp  r   rq  r/   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUR                  S   nU R                  U5      u  nu  pxUR                  SS n	U R                  USUU	US9n
U(       a  U
R                  OU
S   nSn[        U R                  U5       Hj  u  pXR                  ;   d  M  U R                   R                  (       a4  USS2SS2SS24   nUR                  SSS5      nUR                  USXx5      nX4-  nMl     U R                   R                  (       aY  U R                  US   5      U R!                  US   5      U R#                  US   5      U R%                  US	   5      /n['        U5      nU(       d  U(       a  U4U
SS -   nU$ U4U
SS -   nU$ [)        UU(       a  U
R                  OSU
R*                  S
9$ )a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
>>> model = AutoBackbone.from_pretrained(
...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 14, 14]
```Nr   ru   T)rp  r   r   rq  r   r#   rt   r   )feature_mapsrH   ry  )rY   r  rp  r   r4   rp   r  rH   zipstage_namesout_featuresreshape_hidden_statesr   r   r\  r<  r=  r>  r@  r   r   ry  )rE   r   rp  r   rq  r   r  r   r   r   r   rH   rb  stager  r<   s                   r+   rK   BeitBackbone.forward  s    @ &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq!''*
8<8U55<!''+
,,!%/!#  
 2=--'!*#&t'7'7#GE)));;44#/12q#9L#/#7#71a#@L#/#7#7
B#bL/ $H ;;		,q/*		,q/*		,q/*		,q/*	L !.L#&712;6 M '712;6M%3G'//T))
 	
r*   )rp   r  r<  r=  r>  r@  r[  )NNN)r$   r%   r&   r'   rD   r  r   r   r   r   r   rK   r)   rT   rU   s   @r+   rX  rX    so    <0  04,0&*Q
Q
 'tnQ
 $D>	Q

 d^Q
 
Q
 Q
r*   rX  )r  r  r8  r  r  rX  )r1   F)Pr(   collections.abcrf   r   r   dataclassesr   typingr   r   r   r   r6   torch.utils.checkpointr   r	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   utils.backbone_utilsr   configuration_beitr   
get_loggerr$   r   _EXPECTED_OUTPUT_SHAPE_IMAGE_CLASS_CHECKPOINT_IMAGE_CLASS_EXPECTED_OUTPUTr!   rR   r   r=   Moduler?   rW   ra   r   r   r   r  r  r  r   r&  r   ra  r  r  r  r  r  r  r  r  r  r(  r8  rX  __all__r#   r*   r+   <module>r{     s       ! / /    A A !  . v v 7 7 1 * 
		H	%
 '  < 1  !;  2U\\ e T V[VbVb (-299 - c7RYY c7L#7")) #7LP		 Pf;#- ;#|RYY & ! )BII )Xryy  
 
>		 >BP3ryy P3fT
")) T
n #O/ #O #OL T
# T
 T
n & X
!4 X
X
v K
!4 K
K
\"RYY "Jbii ""ryy "JR299 Rj8")) 8v M
"5 M
 M
` 
t
& t

t
nr*   