
"""PyTorch ViTDet backbone."""

import collections.abc
import math
from typing import Dict, List, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput, BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_vitdet import VitDetConfig


logger = logging.get_logger(__name__)


class VitDetEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) to be consumed by a Transformer.
    """
    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.pretrain_image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        if config.use_absolute_position_embeddings:
            # Initialize absolute positional embedding with pretrain image size.
            num_positions = num_patches + 1
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_positions, config.hidden_size))
        else:
            self.position_embeddings = None

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def get_absolute_positions(self, abs_pos_embeddings, has_cls_token, height, width):
        """
        Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token dimension for the
        original embeddings.

        Args:
            abs_pos_embeddings (`torch.Tensor`):
                Absolute positional embeddings with (1, num_position, num_channels).
            has_cls_token (`bool`):
                If true, has 1 embedding in abs_pos_embeddings for cls token.
            height (`int`):
                Height of input image tokens.
            width (`int`):
                Width of input image tokens.

        Returns:
            Absolute positional embeddings after processing with shape (1, height, width, num_channels)
        """
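        # Illustrative sketch (assumptions: default `VitDetConfig`, i.e. 224x224 pretraining
        # resolution with 16x16 patches, giving 14x14 = 196 positions plus one cls token):
        #
        # >>> import torch
        # >>> from transformers import VitDetConfig
        # >>> from transformers.models.vitdet.modeling_vitdet import VitDetEmbeddings
        # >>> embeddings = VitDetEmbeddings(VitDetConfig(use_absolute_position_embeddings=True))
        # >>> abs_pos = torch.zeros(1, 197, 768)  # 196 patch positions + 1 cls token
        # >>> embeddings.get_absolute_positions(abs_pos, True, 16, 16).shape  # resized 14x14 -> 16x16
        # torch.Size([1, 16, 16, 768])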
        if has_cls_token:
            abs_pos_embeddings = abs_pos_embeddings[:, 1:]
        num_position = abs_pos_embeddings.shape[1]
        size = int(math.sqrt(num_position))
        if size * size != num_position:
            raise ValueError("Absolute position embeddings must be a square number.")

        if torch.jit.is_tracing() or (size != height or size != width):
            new_abs_pos_embeddings = nn.functional.interpolate(
                abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2),
                size=(height, width),
                mode="bicubic",
                align_corners=False,
            )
            return new_abs_pos_embeddings.permute(0, 2, 3, 1)
        else:
            return abs_pos_embeddings.reshape(1, height, width, -1)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the"
                f" configuration. Expected {self.num_channels} but got {num_channels}."
            )
        embeddings = self.projection(pixel_values)

        if self.position_embeddings is not None:
            # (batch_size, num_channels, height, width) -> (batch_size, height, width, num_channels)
            embeddings = embeddings.permute(0, 2, 3, 1)
            # add position embeddings
            embeddings = embeddings + self.get_absolute_positions(
                self.position_embeddings, True, embeddings.shape[1], embeddings.shape[2]
            )
            # (batch_size, height, width, num_channels) -> (batch_size, num_channels, height, width)
            embeddings = embeddings.permute(0, 3, 1, 2)

        return embeddings
  5      nUR                  S   U:w  aq  [        R                  R                  UR                  SUR                  S   S5      R                  SSS5      USS9nUR                  SU5      R                  SS5      nOUn[        R                  " U 5      SS2S4   [        X-  S5      -  n[        R                  " U5      SSS24   [        X-  S5      -  nXV-
  US-
  [        X-  S5      -  -   nXGR                  5          $ )	aq  
    """
    Get relative positional embeddings according to the relative positions of query and key sizes.

    Args:
        q_size (`int`):
            Size of query q.
        k_size (`int`):
            Size of key k.
        rel_pos (`torch.Tensor`):
            Relative position embeddings (num_embeddings, num_channels).

    Returns:
        Extracted positional embeddings according to relative positions.
    """
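    # Illustrative worked example (hypothetical values): with q_size = k_size = 2,
    # max_rel_dist is 3 and each (query, key) pair indexes one of the 3 offset embeddings.
    #
    # >>> import torch
    # >>> rel_pos = torch.arange(3.0)[:, None]  # 3 offset embeddings, 1 channel each
    # >>> get_rel_pos(2, 2, rel_pos).squeeze(-1)
    # tensor([[1., 0.],
    #         [2., 1.]])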
    max_rel_dist = int(2 * max(q_size, k_size) - 1)
    # Interpolate rel pos if needed.
    if rel_pos.shape[0] != max_rel_dist:
        rel_pos_resized = nn.functional.interpolate(
            rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        )
        rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
    else:
        rel_pos_resized = rel_pos

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return rel_pos_resized[relative_coords.long()]


def add_decomposed_relative_positions(attn, queries, rel_pos_h, rel_pos_w, q_size, k_size):
    """
    Calculate decomposed Relative Positional Embeddings as introduced in
    [MViT2](https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py).

    Args:
        attn (`torch.Tensor`):
            Attention map.
        queries (`torch.Tensor`):
            Query q in the attention layer with shape (batch_size, queries_height * queries_width, num_channels).
        rel_pos_h (`torch.Tensor`):
            Relative position embeddings (Lh, num_channels) for height axis.
        rel_pos_w (`torch.Tensor`):
            Relative position embeddings (Lw, num_channels) for width axis.
        q_size (`Tuple[int]`):
            Spatial sequence size of query q with (queries_height, queries_width).
        k_size (`Tuple[int]`):
            Spatial sequence size of key k with (keys_height, keys_width).

    Returns:
        attn (Tensor): attention map with added relative positional embeddings.
    """
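    # Illustrative shape walkthrough (hypothetical sizes: one head, a 2x2 query/key grid,
    # 4 channels, so each axis needs 2 * 2 - 1 = 3 relative offset embeddings):
    #
    # >>> import torch
    # >>> attn = torch.zeros(1, 4, 4)     # (batch_size * num_heads, q_h * q_w, k_h * k_w)
    # >>> queries = torch.randn(1, 4, 4)  # (batch_size * num_heads, q_h * q_w, num_channels)
    # >>> rel_pos = torch.randn(3, 4)     # (2 * 2 - 1 offsets, num_channels)
    # >>> add_decomposed_relative_positions(attn, queries, rel_pos, rel_pos, (2, 2), (2, 2)).shape
    # torch.Size([1, 4, 4])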
zbhwc,hkc->bhwkzbhwc,wkc->bhwkN)rk   r:   rC   r'   einsumview)attnqueries	rel_pos_h	rel_pos_wrc   rd   queries_heightqueries_widthkeys_height
keys_widthrelative_heightrelative_width
batch_size_dimr_qrelative_weights                    r0   !add_decomposed_relative_positionsr~      s    , %+!N$K!.yIO IFN J3
//*m
ICll#3SJOll#3SIO 			*m*U
!Q1d*
+	,
!Q4*
+	, d:5{7OP	 	 Kr2   c                   :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )VitDetAttention   z=Multi-head Attention block with relative position embeddings.c                 (  > [         TU ]  5         UR                  nUR                  nX@l        X4-  nUS-  U l        [        R                  " X3S-  UR                  S9U l	        [        R                  " X35      U l
        UR                  U l        U R                  (       as  [        R                  " [        R                  " SUS   -  S-
  U5      5      U l        [        R                  " [        R                  " SUS   -  S-
  U5      5      U l        gg)z
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            input_size (`Tuple[int]`, *optional*):
                Input resolution, only required in case relative position embeddings are added.
        """
        super().__init__()

        dim = config.hidden_size
        num_heads = config.num_attention_heads

        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_relative_position_embeddings = config.use_relative_position_embeddings
        if self.use_relative_position_embeddings:
            # initialize relative positional embeddings
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, hidden_state, output_attentions=False):
        batch_size, height, width, _ = hidden_state.shape
        # qkv with shape (3, batch_size, num_heads, height * width, num_channels)
        qkv = self.qkv(hidden_state).reshape(batch_size, height * width, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # queries, keys and values have shape (batch_size * num_heads, height * width, num_channels)
        queries, keys, values = qkv.reshape(3, batch_size * self.num_heads, height * width, -1).unbind(0)

        attention_scores = (queries * self.scale) @ keys.transpose(-2, -1)

        if self.use_relative_position_embeddings:
            attention_scores = add_decomposed_relative_positions(
                attention_scores, queries, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
            )

        attention_probs = attention_scores.softmax(dim=-1)

        hidden_state = attention_probs @ values
        hidden_state = hidden_state.reshape(batch_size, self.num_heads, height, width, -1)
        hidden_state = hidden_state.permute(0, 2, 3, 1, 4)
        hidden_state = hidden_state.reshape(batch_size, height, width, -1)
        hidden_state = self.proj(hidden_state)

        if output_attentions:
            attention_probs = attention_probs.reshape(
                batch_size, self.num_heads, attention_probs.shape[-2], attention_probs.shape[-1]
            )
            outputs = (hidden_state, attention_probs)
        else:
            outputs = (hidden_state,)

        return outputs


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
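    # Illustrative sketch of the behaviour (hypothetical values): entire samples are zeroed
    # and the survivors are rescaled by 1 / keep_prob so the expected value is unchanged.
    #
    # >>> import torch
    # >>> x = torch.ones(4, 2, 3)
    # >>> y = drop_path(x, drop_prob=0.5, training=True)
    # >>> # each of the 4 samples is now either all 0.0 or all 2.0 (= 1 / 0.5)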
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class VitDetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class VitDetLayerNorm(nn.Module):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and variance normalization over the
    channel dimension for inputs that have shape (batch_size, channels, height, width).
    https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.eps)
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x


class VitDetResBottleneckBlock(nn.Module):
    """
    The standard bottleneck residual block without the last activation layer. It contains 3 conv layers with kernels
    1x1, 3x3, 1x1.
    """

    def __init__(self, config, in_channels, out_channels, bottleneck_channels):
        """
        Args:
            config (`VitDetConfig`):
                Model configuration.
            in_channels (`int`):
                Number of input channels.
            out_channels (`int`):
                Number of output channels.
            bottleneck_channels (`int`):
                Number of output channels for the 3x3 "bottleneck" conv layers.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, bottleneck_channels, 1, bias=False)
        self.norm1 = VitDetLayerNorm(bottleneck_channels)
        self.act1 = ACT2FN[config.hidden_act]

        self.conv2 = nn.Conv2d(bottleneck_channels, bottleneck_channels, 3, padding=1, bias=False)
        self.norm2 = VitDetLayerNorm(bottleneck_channels)
        self.act2 = ACT2FN[config.hidden_act]

        self.conv3 = nn.Conv2d(bottleneck_channels, out_channels, 1, bias=False)
        self.norm3 = VitDetLayerNorm(out_channels)

    def forward(self, x):
        out = x
        for layer in self.children():
            out = layer(out)

        out = x + out
        return out


class VitDetMlp(nn.Module):
    def __init__(self, config, in_features: int, hidden_features: int) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(config.dropout_prob)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)

        return x


def window_partition(hidden_state, window_size):
    """
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    """
    batch_size, height, width, num_channels = hidden_state.shape

    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))
    padded_height, padded_width = height + pad_height, width + pad_width

    hidden_state = hidden_state.view(
        batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels
    )
    windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows, (padded_height, padded_width)


def window_unpartition(windows, window_size, pad_height_width, height_width):

Args:
    windows (`torch.Tensor`):
        Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
    window_size (`int`):
        Window size.
    pad_height_width (`Tuple[int]`):
        Padded height and width (padded_height, padded_width).
    height_width (`Tuple[int]`):
        Original height and width before padding.

Returns:
    hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
r   r4   r   r	   r5   r   r   N)r:   rn   rD   r   )
    padded_height, padded_width = pad_height_width
    height, width = height_width
    batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size)
    hidden_state = windows.view(
        batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1
    )
    hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous()
    hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1)

    hidden_state = hidden_state[:, :height, :width, :].contiguous()
    return hidden_state


class VitDetLayer(nn.Module):
    """This corresponds to the Block class in the original implementation."""

    def __init__(
        self, config: VitDetConfig, drop_path_rate: float = 0, window_size: int = 0, use_residual_block: bool = False
    ) -> None:
        super().__init__()

        dim = config.hidden_size

        image_size = config.image_size
        image_size = image_size if isinstance(image_size, (list, tuple)) else (image_size, image_size)
        patch_size = config.patch_size
        patch_size = patch_size if isinstance(patch_size, (list, tuple)) else (patch_size, patch_size)
        input_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = VitDetAttention(
            config, input_size=input_size if window_size == 0 else (window_size, window_size)
        )

        self.drop_path = VitDetDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.mlp = VitDetMlp(config=config, in_features=dim, hidden_features=int(dim * config.mlp_ratio))

        self.window_size = window_size

        self.use_residual_block = use_residual_block
        if self.use_residual_block:
            # Use a residual block with bottleneck channel as dim // 2
            self.residual = VitDetResBottleneckBlock(
                config=config,
                in_channels=dim,
                out_channels=dim,
                bottleneck_channels=dim // 2,
            )

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        hidden_states = hidden_states.permute(0, 2, 3, 1)

        shortcut = hidden_states

        hidden_states = self.norm1(hidden_states)

        # Window partition
        if self.window_size > 0:
            height, width = hidden_states.shape[1], hidden_states.shape[2]
            hidden_states, pad_height_width = window_partition(hidden_states, self.window_size)

        self_attention_outputs = self.attention(
            hidden_states,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # Reverse window partition
        if self.window_size > 0:
            hidden_states = window_unpartition(hidden_states, self.window_size, pad_height_width, (height, width))

        hidden_states = shortcut + self.drop_path(hidden_states)

        hidden_states = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))

        hidden_states = hidden_states.permute(0, 3, 1, 2)

        if self.use_residual_block:
            hidden_states = self.residual(hidden_states)

        outputs = (hidden_states,) + outputs

        return outputs


class VitDetEncoder(nn.Module):
    def __init__(self, config: VitDetConfig) -> None:
        super().__init__()
        self.config = config
        depth = config.num_hidden_layers

        # stochastic depth decay rule
        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, depth, device="cpu")]

        layers = []
        for i in range(depth):
            layers.append(
                VitDetLayer(
                    config,
                    drop_path_rate=drop_path_rate[i],
                    window_size=config.window_size if i in config.window_block_indices else 0,
                    use_residual_block=i in config.residual_block_indices,
                )
            )

        self.layer = nn.ModuleList(layers)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


def caffe2_msra_fill(module: nn.Module) -> None:
    """
    Initialize `module.weight` using the "MSRAFill" implemented in Caffe2. Also initializes `module.bias` to 0.

    Source: https://detectron2.readthedocs.io/en/latest/_modules/fvcore/nn/weight_init.html.

    Args:
        module (torch.nn.Module): module to initialize.
    """
    nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
    if module.bias is not None:
        nn.init.constant_(module.bias, 0)


@auto_docstring
class VitDetPreTrainedModel(PreTrainedModel):
    config_class = VitDetConfig
    base_model_prefix = "vitdet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast to float32 for the truncated-normal init, then cast back to the
            # original dtype (trunc_normal_ is not implemented for half precision).
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, VitDetEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
        elif isinstance(module, VitDetAttention) and self.config.use_relative_position_embeddings:
            module.rel_pos_h.data = nn.init.trunc_normal_(
                module.rel_pos_h.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            module.rel_pos_w.data = nn.init.trunc_normal_(
                module.rel_pos_w.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
        elif isinstance(module, VitDetResBottleneckBlock):
            for layer in [module.conv1, module.conv2, module.conv3]:
                caffe2_msra_fill(layer)
            for layer in [module.norm1, module.norm2]:
                layer.weight.data.fill_(1.0)
                layer.bias.data.zero_()
            # zero init last norm layer.
            module.norm3.weight.data.zero_()
            module.norm3.bias.data.zero_()


@auto_docstring
class VitDetModel(VitDetPreTrainedModel):
    def __init__(self, config: VitDetConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = VitDetEmbeddings(config)
        self.encoder = VitDetEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> VitDetEmbeddings:
        return self.embeddings.projection

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetModel
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetModel(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 768, 14, 14]
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
    )custom_introc                      ^  \ rS rSrU 4S jrS\4S jr\   SS\R                  S\
\   S\
\   S\
\   S\4
S	 jj5       rS
rU =r$ )VitDetBackbonei  c                   > [         TU ]  U5        [         TU ]	  U5        [        U5      U l        [        U5      U l        [        UR                  S-   5       Vs/ s H  o!R                  PM     snU l
        U R                  5         g s  snf )Nr   )r   r   _init_backboner   rQ   r  rZ  r  r  r   num_featuresr[  )r,   r-   rz   r/   s      r0   r   VitDetBackbone.__init__  su     v&*62$V,9>v?W?WZ[?[9\]9\A//9\] 	 ^s   BrN   c                 .    U R                   R                  $ r   r^  r   s    r0   r_  #VitDetBackbone.get_input_embeddings  ra  r2   rM   r&  r   r'  c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nU R                  USUUS9nU(       a  UR                  OUS   nSn[        U R                  U5       H  u  pXR                  ;   d  M  X4-  nM     U(       d  U(       a  U4USS -   nU$ U4USS -   nU$ [        UU(       a  UR                  OSUR                  S9$ )ap  
        r"""
        Examples:

        ```python
        >>> from transformers import VitDetConfig, VitDetBackbone
        >>> import torch

        >>> config = VitDetConfig()
        >>> model = VitDetBackbone(config)

        >>> pixel_values = torch.randn(1, 3, 224, 224)

        >>> with torch.no_grad():
        ...     outputs = model(pixel_values)

        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )


__all__ = ["VitDetModel", "VitDetPreTrainedModel", "VitDetBackbone"]