
    fThs              	          S r SSKrSSKrSSKJrJr  SSKrSSKrSSK	rSSKJ
r
Jr  SSKJrJrJr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  SSKJr  \R@                  " \!5      r"S7S\\\#4   4S jjr$ " S S\RJ                  5      r& " S S\RN                  5      r( " S S\RR                  5      r* " S S\RV                  5      r, " S S\RR                  5      r-S8S\R                  S\.S\#S\R                  4S jjr/ " S S\RR                  5      r0S9S  jr1 " S! S"\RR                  5      r2 " S# S$\RR                  5      r3 " S% S&\RR                  5      r4 " S' S(\RR                  5      r5 " S) S*\RR                  5      r6\ " S+ S,\5      5       r7\ " S- S.\75      5       r8\" S/S09 " S1 S2\75      5       r9\" S3S09 " S4 S5\7\5      5       r:/ S6Qr;g):z9PyTorch BiT model. Also supports backbone for ViT hybrid.    N)OptionalTuple)Tensornn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttention)PreTrainedModel)auto_docstringlogging)BackboneMixin   )	BitConfigreturnc                 "   SnU c  US-
  X1S-
  -  -   S-  n X4$ [        U [        5      (       a`  U R                  5       n U S:X  a/  US:X  a!  X1S-
  -  S-  S:X  a  US-
  X1S-
  -  -   S-  n X4$ Sn Sn X4$ U S:X  a  Sn X4$ US-
  X1S-
  -  -   S-  n X4$ )a<  
Utility function to get the tuple padding value given the kernel_size and padding.

Args:
    padding (Union[`str`, `int`], *optional*):
        Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from
        PyTorch is used.
    kernel_size (`int`, *optional*, defaults to 7):
        Kernel size of the convolution layers.
    stride (`int`, *optional*, defaults to 1):
        Stride value of the convolution layers.
    dilation (`int`, *optional*, defaults to 1):
        Dilation value of the convolution layers.
Fr      samer   Tvalid)
isinstancestrlower)paddingkernel_sizestridedilationdynamics        \/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/bit/modeling_bit.pyget_padding_valuer$   +   s     GQJ(Ao">>1D'3--/f{!O <AQF"QJ(Ao*FF1L    G  
h/&BBqHG    c                   B   ^  \ rS rSrSr      SU 4S jjrS rSrU =r$ )WeightStandardizedConv2dT   zConv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model.

Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight
Standardization](https://arxiv.org/abs/1903.10520v2)
c
                    > [        XSXFS9u  pZ[        TU ]	  UUUUUUUUS9  U
(       a  [        X4U5      U l        OS U l        Xl        g )N)r    r!   )r    r   r!   groupsbias)r$   super__init__DynamicPad2dpadeps)self
in_channelout_channelsr   r    r   r!   r*   r+   r0   
is_dynamic	__class__s              r#   r-   !WeightStandardizedConv2d.__init__[   s]     0Vg 	 		
 #KBDHDHr%   c           	         U R                   b  U R                  U5      n[        R                  R                  U R                  R                  SU R                  S5      S S SSU R                  S9R                  U R                  5      n[        R                  R                  XU R                  U R                  U R                  U R                  U R                  5      nU$ )Nr   T        )trainingmomentumr0   )r/   r   
functional
batch_normweightreshaper3   r0   
reshape_asconv2dr+   r    r   r!   r*   )r1   hidden_stater>   s      r#   forward WeightStandardizedConv2d.forwardx   s    8888L1L))KK4#4#4b94PT_bhlhphp * 

*T[[
! 	 }}++$))T[[$,,W[WbWb
 r%   )r0   r/   )r   SAMEr   r   Fgư>	__name__
__module____qualname____firstlineno____doc__r-   rC   __static_attributes____classcell__r5   s   @r#   r'   r'   T   s+     :	 	r%   r'   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )BitGroupNormActivation   zI
A module that combines group normalization with an activation function.
c                    > [         [        U ]  UR                  X#US9  U(       a  [        UR
                     U l        g [        R                  " 5       U l        g )N)r0   affine)	r,   rP   r-   
num_groupsr   
hidden_act
activationr   Identity)r1   confignum_channelsr0   rS   apply_activationr5   s         r#   r-   BitGroupNormActivation.__init__   sC    $d4V5F5Ffl4m$V%6%67DO kkmDOr%   c                     [         R                  R                  XR                  U R                  U R
                  U R                  5      nU R                  U5      nU$ N)r   r<   
group_normrT   r>   r+   r0   rV   )r1   rB   s     r#   rC   BitGroupNormActivation.forward   sF    }}//oot{{\`\e\egkgogop|4r%   )rV   )gh㈵>TTrF   rN   s   @r#   rP   rP      s    , r%   rP   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ )r.      z
A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input
hidden states.
c                    > [         TU ]  5         [        U[        5      (       a  X4n[        U[        5      (       a  X"4n[        U[        5      (       a  X34nXl        X l        X0l        X@l        S nXPl        g )Nc                 p    [        [        R                  " X-  5      S-
  U-  US-
  U-  -   S-   U -
  S5      $ )Nr   r   )maxmathceil)xr   r    r!   s       r#   compute_padding.DynamicPad2d.__init__.<locals>.compute_padding   s@    		!*-1V;{QRZ>ZZ]^^abbdeffr%   )	r,   r-   r   intr   r    r!   valuerh   )r1   r   r    r!   rk   rh   r5   s         r#   r-   DynamicPad2d.__init__   sp    k3''&4Kfc""%Fh$$ +H& 
	g  /r%   c           	         UR                  5       SS  u  p#U R                  X R                  S   U R                  S   U R                  S   5      nU R                  X0R                  S   U R                  S   U R                  S   5      nUS:  d  US:  a=  [
        R                  R                  UUS-  XUS-  -
  US-  XDS-  -
  /U R                  S9nU$ )Nr   r   r   )rk   )	sizerh   r   r    r!   r   r<   r/   rk   )r1   inputinput_heightinput_widthpadding_heightpadding_widths         r#   rC   DynamicPad2d.forward   s    $)JJL$5! --l<L<LQ<OQUQ\Q\]^Q_aeananopaqr,,[:J:J1:Mt{{[\~_c_l_lmn_op A!2MM%%!Q&!Q$66"a'"q%88	 jj & 	E r%   )rh   r!   r   r    rk   )r   rF   rN   s   @r#   r.   r.      s    
/, r%   r.   c                   J   ^  \ rS rSrSr      SS\4U 4S jjjrS rSrU =r	$ )BitMaxPool2d   z1Tensorflow like 'SAME' wrapper for 2D max poolingr   c                   > [        U[        R                  R                  5      (       a  UOX4n[        U[        R                  R                  5      (       a  UOX"4n[        U[        R                  R                  5      (       a  UOX34n[        TU ]  XXSU5        U(       a  [        XX65      U l        g [        R                  " 5       U l        g r]   )
r   collectionsabcIterabler,   r-   r.   r/   r   rW   )	r1   r   r    r!   	ceil_moder   padding_valueuse_dynamic_paddingr5   s	           r#   r-   BitMaxPool2d.__init__   s     &0[__=U=U%V%Vk]h\v%fkoo.F.FGGfM])(KOO4L4LMM8T\SggK#KQDH{{}DHr%   c                     U R                  U5      n[        R                  R                  XR                  U R
                  U R                  U R                  U R                  5      $ r]   )	r/   r   r<   
max_pool2dr   r    r   r!   r}   r1   hidden_statess     r#   rC   BitMaxPool2d.forward   sK    /}}''++T[[$,,W[WeWe
 	
r%   )r/   )Nr   F)r   r   r   T)
rG   rH   rI   rJ   rK   rj   r-   rC   rL   rM   rN   s   @r#   rw   rw      s6    ;
  %% %&
 
r%   rw   c                   F   ^  \ rS rSrSrS\4U 4S jjrS\S\4S jrSr	U =r
$ )	BitEmbeddings   zD
BiT Embeddings (stem) composed of a single aggressive convolution.
rX   c           	         > [         TU ]  5         [        UR                  UR                  SSSUR
                  S9U l        [        SSUR                  S9U l	        UR
                  b9  UR
                  R                  5       S:X  a  [        R                  " 5       U l        O[        R                  " SS	S
9U l        UR                  S:X  d  [!        XR                  S9U l        O[        R                  " 5       U l        UR                  U l        g )N   r   :0yE>)r   r    r0   r   r
   )r   r    r   rE   )r   r   r   r   r9   )r   rk   preactivationrY   )r,   r-   r'   rY   embedding_sizeglobal_paddingconvolutionrw   embedding_dynamic_paddingpoolerupperr   rW   r/   ConstantPad2d
layer_typerP   normr1   rX   r5   s     r#   r-   BitEmbeddings.__init__   s    3!!))
 #qPVPpPpq   ,1F1F1L1L1NRX1X{{}DH''CHDH  O3.vDYDYZDIDI"//r%   pixel_valuesr   c                     UR                   S   nX R                  :w  a  [        S5      eU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nU$ )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)shaperY   
ValueErrorr   r/   r   r   )r1   r   rY   	embeddings       r#   rC   BitEmbeddings.forward  sp    #))!,,,,w  $$\2	HHY'	IIi(	KK	*	r%   )r   r   rY   r/   r   )rG   rH   rI   rJ   rK   r   r-   r   rC   rL   rM   rN   s   @r#   r   r      s,    0y 06F v  r%   r   rp   	drop_probr:   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
r9   r   r   )r   )dtypedevice)r   ndimtorchrandr   r   floor_div)rp   r   r:   	keep_probr   random_tensoroutputs          r#   	drop_pathr     s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr%   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )BitDropPathi.  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 .   > [         TU ]  5         Xl        g r]   )r,   r-   r   )r1   r   r5   s     r#   r-   BitDropPath.__init__1  s    "r%   r   c                 B    [        XR                  U R                  5      $ r]   )r   r   r:   r   s     r#   rC   BitDropPath.forward5  s    FFr%   c                 8    SR                  U R                  5      $ )Nzp={})formatr   )r1   s    r#   
extra_reprBitDropPath.extra_repr8  s    }}T^^,,r%   )r   r]   )rG   rH   rI   rJ   rK   r   floatr-   r   r   rC   r   r   rL   rM   rN   s   @r#   r   r   .  sQ    b#(5/ #T # #GU\\ Gell G-C - -r%   r   c                 d    Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  nU$ )Nr   g?)rd   rj   )rk   divisor	min_value	new_values       r#   make_divr   <  sC    IIs5Q;#677BWLMI3;	r%   c                   F   ^  \ rS rSrSr        SU 4S jjrS rSrU =r$ )BitPreActivationBottleneckLayeriD  zPre-activation (v2) bottleneck block.
Follows the implementation of "Identity Mappings in Deep Residual Networks":
https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua

Except it puts the stride on 3x3 conv when available.
c           
        > [         TU ]  5         U=(       d    UnU=(       d    Un[        X4-  5      nU
(       a  [        UUUUSS9U l        OS U l        [        X5      U l        [        X+SSUR                  S9U l	        [        XS9U l
        [        XSXXSUR                  S9U l        [        X5      U l        [        XSSUR                  S9U l        U	S	:  a  [        U	5      U l        g [        R                   " 5       U l        g )
NTr    preactr   r   r0   r   r   r
   )r    r*   r0   r   r   )r,   r-   r   BitDownsampleConv
downsamplerP   norm1r'   r   conv1norm2conv2norm3conv3r   r   rW   r   )r1   rX   in_channelsr3   bottle_ratior    r!   first_dilationr*   drop_path_rateis_first_layermid_channelsr5   s               r#   r-   (BitPreActivationBottleneckLayer.__init__L  s     	'38#2{ ;</DO #DO+F@
-kPT^d^s^st
+FN
-&T[a[p[p

 ,FA
-l!QU_e_t_tu
8F8J^4PRP[P[P]r%   c                 0   U R                  U5      nUnU R                  b  U R                  U5      nU R                  U5      nU R                  U R	                  U5      5      nU R                  U R                  U5      5      nU R                  U5      nX-   $ r]   )r   r   r   r   r   r   r   r   )r1   r   hidden_states_preactshortcuts       r#   rC   'BitPreActivationBottleneckLayer.forwardx  s    #zz-8 !??&';<H 

#78

4::m#<=

4::m#<=}5''r%   )r   r   r   r   r   r   r   r   N      ?r   r   Nr   r9   FrF   rN   s   @r#   r   r   D  s3     *^X( (r%   r   c                   F   ^  \ rS rSrSr        SU 4S jjrS rSrU =r$ )BitBottleneckLayeri  z\Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid.c                 0  > [         TU ]  5         U=(       d    UnU=(       d    Un[        X4-  5      nU
(       a  [        UUUUSS9U l        OS U l        [        X+SSUR                  S9U l        [        XS9U l	        [        UUSUUUSUR                  S9U l
        [        XS9U l        [        XSSUR                  S9U l        [        XSS	9U l        U	S
:  a  [        U	5      O[        R                   " 5       U l        [$        UR&                     U l        g )NFr   r   r   r   r   r
   )r    r!   r*   r0   r   rY   rZ   r   )r,   r-   r   r   r   r'   r   r   rP   r   r   r   r   r   r   r   rW   r   r   rU   rV   )r1   rX   r   r3   r   r    r!   r   r*   r   r   mid_chsr5   s               r#   r-   BitBottleneckLayer.__init__  s    	'38#2{<67/DO #DO-kA4Y_YnYno
+FI
-#))	

 ,FI
-gQDZ`ZoZop
+F`ef
8F8J^4PRP[P[P] !2!23r%   c                 Z   UnU R                   b  U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R                  X-   5      nU$ r]   )	r   r   r   r   r   r   r   r   rV   )r1   r   r   s      r#   rC   BitBottleneckLayer.forward  s     ??&}5H 

=1

=1

=1

=1

=1

=1}5(@Ar%   )	rV   r   r   r   r   r   r   r   r   r   rF   rN   s   @r#   r   r     s0    f /4b r%   r   c                   6   ^  \ rS rSr  SU 4S jjrS rSrU =r$ )r   i  c           	         > [         TU ]  5         [        X#SUSUR                  S9U l        U(       a  [
        R                  " 5       U l        g [        XSS9U l        g )Nr   r   )r    r0   r   Fr   )	r,   r-   r'   r   convr   rW   rP   r   )r1   rX   r   r3   r    r   r5   s         r#   r-   BitDownsampleConv.__init__  sX     	,qT6K`K`
	
  KKM 		 (\ab 		r%   c                 B    U R                  U R                  U5      5      $ r]   )r   r   )r1   rg   s     r#   rC   BitDownsampleConv.forward  s    yy1&&r%   )r   r   )r   T)rG   rH   rI   rJ   r-   rC   rL   rM   rN   s   @r#   r   r     s     
$' 'r%   r   c                   L   ^  \ rS rSrSr  S	U 4S jjrS rS\S\4S jrSr	U =r
$ )
BitStagei  z/
A ResNet v2 stage composed by stacked layers.
c	                 `  > [         TU ]  5         US;   a  SOSn	UR                  S:X  a  [        n
O[        n
Un[
        R                  " 5       U l        [        U5       HM  nU R                  XU5      u  pMnU R                  R                  [        U5      U
" UUUUUUU	UUS9	5        UnUn	MO     g )N)r   r   r   r   
bottleneck)r    r!   r   r   r   r   )r,   r-   r   r   r   r   
Sequentiallayersrange_get_updated_hyperparameters
add_moduler   )r1   rX   r   r3   r    r!   depthr   layer_dropoutr   	layer_clsprev_chs	layer_idxr   r   r5   s                  r#   r-   BitStage.__init__  s     	&&0a ,*I7ImmouI595V5V=62FN KK""I !%!-#1#1#1
 $H%N+ &r%   c                 @    U(       a  X1   nOSnUS:w  a  SnUS:H  nX$U4$ )zd
Get the new hyper-parameters with respect to the previous ones and the index of the current layer.
r9   r   r    )r1   r   r    r   r   r   s         r#   r   %BitStage._get_updated_hyperparameters  s4     *5N N>F"a~55r%   rp   r   c                 V    Un[        U R                  5       H  u  p4U" U5      nM     U$ r]   )	enumerater   )r1   rp   rB   _layers        r#   rC   BitStage.forward+  s,    !$++.HA .L /r%   )r   )r   N)rG   rH   rI   rJ   rK   r-   r   r   rC   rL   rM   rN   s   @r#   r   r     s3     ,&\6 V   r%   r   c            	       V   ^  \ rS rSrS\4U 4S jjrS r SS\S\S\S\	4S	 jjr
S
rU =r$ )
BitEncoderi2  rX   c                   > [         TU ]  5         [        R                  " / 5      U l        UR
                  nSnSn[        R                  " [        R                  " SUR                  [        UR                  5      5      5      R                  UR                  5       Vs/ s H  nUR                  5       PM     nn[        [!        UR                  UR"                  U5      5       HY  u  nu  pn
U R%                  XsXU5      u  pn['        UUUUUUU
S9nUnX<-  nU R                  R)                  [+        U5      U5        M[     g s  snf )N   r   r   )r    r!   r   r   )r,   r-   r   
ModuleListstagesr   r   r   nplinspacer   sumdepthssplittolistr   ziphidden_sizesr   r   r   r   )r1   rX   r   current_strider!   rg   layer_dropouts	stage_idxcurrent_depthcurrent_hidden_sizer   r3   r    stager5   s                 r#   r-   BitEncoder.__init__3  s5   mmB'((  \\"++a1F1FFMMHZ"[\bbcicpcpq
q HHJq 	 

 OXv22NCO
JIJM .2-N-N+>&.*L( !#+E $H$NKK""3y>59+O

s   Ec                 v    [        X5R                  -  5      nUS:X  a  SOSnX%R                  :  a  XG-  nSnXgU4$ )Nr   r   r   )r   width_factoroutput_stride)r1   r  r
  r  r!   rX   r3   r    s           r#   r   'BitEncoder._get_updated_hyperparametersY  sG     36I6I IJ1n!111HFX--r%   rB   output_hidden_statesreturn_dictr   c                     U(       a  SOS nU R                    H  nU(       a  XA4-   nU" U5      nM     U(       a  XA4-   nU(       d  [        S X4 5       5      $ [        UUS9$ )Nr   c              3   .   #    U  H  oc  M  Uv   M     g 7fr]   r   ).0vs     r#   	<genexpr>%BitEncoder.forward.<locals>.<genexpr>p  s     S$Aq$As   	)last_hidden_stater   )r  tupler   )r1   rB   r  r  r   stage_modules         r#   rC   BitEncoder.forwarda  sk     3 KKL# - ?'5L	 (  )O;MS\$ASSS-*'
 	
r%   )r  )FT)rG   rH   rI   rJ   r   r-   r   r   boolr   rC   rL   rM   rN   s   @r#   r   r   2  sF    $:y $:L. ]a
"
:>
UY
	'
 
r%   r   c                   ,    \ rS rSr\rSrSrS/rS r	Sr
g)BitPreTrainedModelix  bitr   r   c                 b   [        U[        R                  5      (       a*  [        R                  R	                  UR
                  SSS9  g [        U[        R                  5      (       a  [        R                  R                  UR
                  [        R                  " S5      S9  UR                  by  [        R                  R                  UR
                  5      u  p#US:  a  S[        R                  " U5      -  OSn[        R                  R                  UR                  U* U5        g g [        U[        R                  [        R                  45      (       aU  [        R                  R                  UR
                  S5        [        R                  R                  UR                  S5        g g )Nfan_outrelu)modenonlinearity   )ar   r   )r   r   Conv2dinitkaiming_normal_r>   Linearkaiming_uniform_re   sqrtr+   _calculate_fan_in_and_fan_outuniform_BatchNorm2d	GroupNorm	constant_)r1   modulefan_inr   bounds        r#   _init_weights BitPreTrainedModel._init_weights  s   fbii((GG##FMM	PV#W		**GG$$V]]diil$C{{&GGAA&--P	17!DIIf--  ufe< '  >??GGfmmQ/GGfkk1- @r%   r   N)rG   rH   rI   rJ   r   config_classbase_model_prefixmain_input_name_no_split_modulesr:  rL   r   r%   r#   r#  r#  x  s     L$O().r%   r#  c            
       ^   ^  \ rS rSrU 4S jr\ S	S\S\\   S\\   S\	4S jj5       r
SrU =r$ )
BitModeli  c                 F  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a  [        XR                  S   S9O[        R                  " 5       U l        [        R                  " S5      U l        U R                  5         g )Nr   r8   r   )r   r   )r,   r-   rX   r   embedderr   encoderr   rP   r	  r   rW   r   AdaptiveAvgPool2dr   	post_initr   s     r#   r-   BitModel.__init__  s     %f-!&)   O3 #68K8KB8OP 		 **62r%   r   r  r  r   c                 H   Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  U5      nU R	                  XBUS9nUS   nU R                  U5      nU R                  U5      nU(       d	  Xg4USS  -   $ [        UUUR                  S9$ )Nr  r  r   r   )r  pooler_outputr   )	rX   r  use_return_dictrC  rD  r   r   r   r   )r1   r   r  r  embedding_outputencoder_outputsr  pooled_outputs           r#   rC   BitModel.forward  s    
 %9$D $++JjJj 	 &1%<k$++B]B]==6,,U` ' 
 ,A. II&78$56%58KKK7/')77
 	
r%   )rX   rC  rD  r   r   NN)rG   rH   rI   rJ   r-   r   r   r   r!  r   rC   rL   rM   rN   s   @r#   rA  rA    sI    " os
"
:B4.
^fgk^l
	1
 
r%   rA  z
    BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrU 4S jr\    S
S\\R                     S\\R                     S\\
   S\\
   S\4
S jj5       rS	rU =r$ )BitForImageClassificationi  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " [
        R                  " 5       UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       5      U l        U R                  5         g )Nr   r8   )r,   r-   
num_labelsrA  r$  r   r   Flattenr/  r	  rW   
classifierrF  r   s     r#   r-   "BitForImageClassification.__init__  s      ++F#--JJLEKEVEVYZEZBIIf))"-v/@/@A`b`k`k`m

 	r%   r   labelsr  r  r   c                 0   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       n	U R                  S:X  a&  U	" UR                  5       UR                  5       5      nOU	" Xr5      nOU R                   R
                  S:X  a=  [        5       n	U	" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       n	U	" Xr5      nU(       d  U4USS -   n
Ub  U4U
-   $ U
$ [!        XUR"                  S	9$ )
a  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NrI  r   
regressionsingle_label_classificationmulti_label_classificationr8   r   )losslogitsr   )rX   rK  r$  rJ  rW  problem_typerU  r   r   longrj   r	   squeezer   viewr   r   r   )r1   r   rY  r  r  outputsrN  r_  r^  loss_fctr   s              r#   rC   !BitForImageClassification.forward  s    &1%<k$++B]B]((<`k(l1<--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F'+'7D7V#CVC3\c\q\qrrr%   )r$  rW  rU  )NNNN)rG   rH   rI   rJ   r-   r   r   r   FloatTensor
LongTensorr!  r   rC   rL   rM   rN   s   @r#   rS  rS    s    
  59-1/3&*/su001/s ))*/s 'tn	/s
 d^/s 
./s /sr%   rS  zL
    BiT backbone, to be used with frameworks like DETR and MaskFormer.
    c            
       ^   ^  \ rS rSrU 4S jr\ S	S\S\\   S\\   S\	4S jj5       r
SrU =r$ )
BitBackbonei  c                    > [         TU ]  U5        [         TU ]	  U5        [        U5      U l        UR
                  /UR                  -   U l        U R                  5         g r]   )	r,   r-   _init_backbonerA  r$  r   r	  num_featuresrF  r   s     r#   r-   BitBackbone.__init__  sQ     v&F##223f6I6II 	r%   r   r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  USSS9nUR                  nSn[        U R                  5       H  u  pxXR                  ;   d  M  XeU   4-  nM      U(       d  U4n	U(       a  XR                  4-  n	U	$ [        UU(       a  UR                  SS9$ SSS9$ )a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("google/bit-50")
>>> model = AutoBackbone.from_pretrained("google/bit-50")

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
```NTrI  r   )feature_mapsr   
attentions)	rX   rK  r  r$  r   r   stage_namesout_featuresr   )
r1   r   r  r  rd  r   rp  idxr  r   s
             r#   rC   BitBackbone.forward  s    , &1%<k$++B]B]$8$D $++JjJj 	 ((<dPT(U--#D$4$45JC)))s!3 55 6 "_F#0022M%3G'//
 	
MQ
 	
r%   )r$  rm  rP  )rG   rH   rI   rJ   r-   r   r   r   r!  r   rC   rL   rM   rN   s   @r#   rj  rj    sI     os-
"-
:B4.-
^fgk^l-
	-
 -
r%   rj  )rS  rA  r#  rj  )Nr   r   r   )r9   F)   )<rK   rz   re   typingr   r   numpyr  r   torch.utils.checkpointr   r   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   utils.backbone_utilsr   configuration_bitr   
get_loggerrG   loggerr!  r$   r,  r'   r5  rP   Moduler.   	MaxPool2drw   r   r   r   r   r   r   r   r   r   r   r#  rA  rS  rj  __all__r   r%   r#   <module>r     s   @   "     A A !  . , 1 ( 
		H	%&ERWY]R]L^ &R-ryy -`R\\ $0299 0f
2<< 
:/BII /fU\\ e T V[VbVb *-")) -A(bii A(HF FR'		 '.Gryy GTC
 C
L . . .* .
! .
 .
b =s 2 =s=s@ 
9
$m 9

9
x Yr%   