
"""PyTorch Swinv2 Transformer model."""

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, logging, torch_int
from ...utils.backbone_utils import BackboneMixin
from .configuration_swinv2 import Swinv2Config


logger = logging.get_logger(__name__)


@dataclass
class Swinv2EncoderOutput(ModelOutput):
    """
Swinv2 encoder's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
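
For instance, a given entry of `reshaped_hidden_states` can be recovered from the matching entry of
`hidden_states` (an illustrative sketch, assuming a square feature map where `height == width`):

```python
# given `hidden_states` from a forward pass and a stage index `i`
batch_size, seq_len, hidden_size = hidden_states[i].shape
height = width = int(seq_len**0.5)
reshaped = hidden_states[i].view(batch_size, height, width, hidden_size).permute(0, 3, 1, 2)
# `reshaped` now has shape `(batch_size, hidden_size, height, width)`,
# i.e. the layout of `reshaped_hidden_states[i]`
```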
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   __static_attributes__r       b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/swinv2/modeling_swinv2.pyr   r   *   s}    2 6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr(   r   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)Swinv2ModelOutputL   a  
Swinv2 model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
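
Example (an illustrative sketch; the checkpoint is the one referenced elsewhere in this module):

```python
import torch
from transformers import AutoImageProcessor, Swinv2Model

image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
model = Swinv2Model.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

inputs = image_processor(images=image, return_tensors="pt")  # `image` is any PIL image
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

outputs.last_hidden_state.shape  # (batch_size, sequence_length, hidden_size)
outputs.pooler_output.shape  # (batch_size, hidden_size)
len(outputs.reshaped_hidden_states)  # embeddings output + one entry per stage
```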
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class Swinv2MaskedImageModelingOutput(ModelOutput):
    """
Swinv2 masked image model outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nlossreconstruction.r   r   r   c                 P    [         R                  " S[        5        U R                  $ )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr2   selfs    r)   logits&Swinv2MaskedImageModelingOutput.logits   s%    ]	

 """r(   r   )r   r    r!   r"   r#   r1   r   r$   r%   r&   r2   r   r   r   r   propertyr9   r'   r   r(   r)   r/   r/   q   s    6 )-D(5$$
%,26NHU../6=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJ# #r(   r/   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)Swinv2ImageClassifierOutput   a  
Swinv2 outputs for image classification.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
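
Example (an illustrative sketch, assuming a checkpoint fine-tuned for image classification):

```python
import torch
from transformers import AutoImageProcessor, Swinv2ForImageClassification

image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

inputs = image_processor(images=image, return_tensors="pt")  # `image` is any PIL image
with torch.no_grad():
    outputs = model(**inputs)

predicted_class_idx = outputs.logits.argmax(-1).item()
print(model.config.id2label[predicted_class_idx])
```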
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None


def window_partition(input_feature, window_size):
    """
    Partitions the given input into windows.
    """
    batch_size, height, width, num_channels = input_feature.shape
    input_feature = input_feature.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = input_feature.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows


def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class Swinv2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Swinv2Embeddings(nn.Module):
    """
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()
        self.patch_embeddings = Swinv2PatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        if config.use_absolute_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.embed_dim))
        else:
            self.position_embeddings = None

        self.norm = nn.LayerNorm(config.embed_dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config
    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor],
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Tuple[torch.Tensor]:
        _, num_channels, height, width = pixel_values.shape
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        if self.position_embeddings is not None:
            if interpolate_pos_encoding:
                embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
            else:
                embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings, output_dimensions
class Swinv2PatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.embed_dim
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        # pad the input to be divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, output_dimensions
class Swinv2PatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: Tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(2 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: Tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # [batch_size, height/2 * width/2, 4*num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # [batch_size, height/2 * width/2, 4*C]

        input_feature = self.reduction(input_feature)
        input_feature = self.norm(input_feature)

        return input_feature


class Swinv2SelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=[0, 0]):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )
        self.pretrained_window_size = pretrained_window_size
        self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
        # mlp to generate continuous relative position bias
        self.continuous_position_bias_mlp = nn.Sequential(
            nn.Linear(2, 512, bias=True), nn.ReLU(inplace=True), nn.Linear(512, num_heads, bias=False)
        )

        # get relative_coords_table
        relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.int64).float()
        relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.int64).float()
        relative_coords_table = (
            torch.stack(meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
            .permute(1, 2, 0)
            .contiguous()
            .unsqueeze(0)
        )  # [1, 2*window_height-1, 2*window_width-1, 2]
        if pretrained_window_size[0] > 0:
            relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1
            relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1
        elif window_size > 1:
            relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1
            relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1
        relative_coords_table *= 8  # normalize to -8, 8
        relative_coords_table = (
            torch.sign(relative_coords_table) * torch.log2(torch.abs(relative_coords_table) + 1.0) / math.log2(8)
        )
        # set to same dtype as mlp weight
        relative_coords_table = relative_coords_table.to(next(self.continuous_position_bias_mlp.parameters()).dtype)
        self.register_buffer("relative_coords_table", relative_coords_table, persistent=False)

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index, persistent=False)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=False)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # cosine attention
        attention_scores = nn.functional.normalize(query_layer, dim=-1) @ nn.functional.normalize(
            key_layer, dim=-1
        ).transpose(-2, -1)
        logit_scale = torch.clamp(self.logit_scale, max=math.log(1.0 / 0.01)).exp()
        attention_scores = attention_scores * logit_scale
        relative_position_bias_table = self.continuous_position_bias_mlp(self.relative_coords_table).view(
            -1, self.num_attention_heads
        )
        # [window_height*window_width, window_height*window_width, num_attention_heads]
        relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )
        # [num_attention_heads, window_height*window_width, window_height*window_width]
        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        relative_position_bias = 16 * torch.sigmoid(relative_position_bias)
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in Swinv2Model forward() function)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class Swinv2SelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class Swinv2Attention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size, pretrained_window_size=0):
        super().__init__()
        self.self = Swinv2SelfAttention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=window_size,
            pretrained_window_size=pretrained_window_size
            if isinstance(pretrained_window_size, collections.abc.Iterable)
            else (pretrained_window_size, pretrained_window_size),
        )
        self.output = Swinv2SelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class Swinv2Intermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class Swinv2Output(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class Swinv2Layer(nn.Module):
    def __init__(
        self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0, pretrained_window_size=0
    ):
        super().__init__()
        self.input_resolution = input_resolution
        window_size, shift_size = self._compute_window_shift(
            (config.window_size, config.window_size), (shift_size, shift_size)
        )
        self.window_size = window_size[0]
        self.shift_size = shift_size[0]
        self.attention = Swinv2Attention(
            config=config,
            dim=dim,
            num_heads=num_heads,
            window_size=self.window_size,
            pretrained_window_size=pretrained_window_size
            if isinstance(pretrained_window_size, collections.abc.Iterable)
            else (pretrained_window_size, pretrained_window_size),
        )
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.drop_path = Swinv2DropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.intermediate = Swinv2Intermediate(config, dim)
        self.output = Swinv2Output(config, dim)
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def _compute_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]:
        window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)]
        shift_size = [
            0 if r <= w else s for r, w, s in zip(self.input_resolution, target_window_size, target_shift_size)
        ]
        return window_size, shift_size

    def get_attn_mask(self, height, width, dtype):
        if self.shift_size > 0:
            # calculate attention mask for shifted window multihead self attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        # pad hidden_states to multiples of window size
        hidden_states = hidden_states.view(batch_size, height, width, channels)
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(height_pad, width_pad, dtype=hidden_states.dtype)
        if attn_mask is not None:
            attn_mask = attn_mask.to(hidden_states_windows.device)

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)
        hidden_states = self.layernorm_before(attention_windows)
        hidden_states = shortcut + self.drop_path(hidden_states)

        layer_output = self.intermediate(hidden_states)
        layer_output = self.output(layer_output)
        layer_output = hidden_states + self.drop_path(self.layernorm_after(layer_output))

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class Swinv2Stage(nn.Module):
    def __init__(
        self, config, dim, input_resolution, depth, num_heads, drop_path, downsample, pretrained_window_size=0
    ):
        super().__init__()
        self.config = config
        self.dim = dim
        blocks = []
        for i in range(depth):
            block = Swinv2Layer(
                config=config,
                dim=dim,
                input_resolution=input_resolution,
                num_heads=num_heads,
                drop_path_rate=drop_path[i],
                shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                pretrained_window_size=pretrained_window_size,
            )
            blocks.append(block)
        self.blocks = nn.ModuleList(blocks)

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                input_dimensions,
                layer_head_mask,
                output_attentions,
            )

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs


class Swinv2Encoder(nn.Module):
    def __init__(self, config, grid_size, pretrained_window_sizes=(0, 0, 0, 0)):
        super().__init__()
        self.num_layers = len(config.depths)
        self.config = config
        if self.config.pretrained_window_sizes is not None:
            pretrained_window_sizes = config.pretrained_window_sizes
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]

        layers = []
        for i_layer in range(self.num_layers):
            stage = Swinv2Stage(
                config=config,
                dim=int(config.embed_dim * 2**i_layer),
                input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                depth=config.depths[i_layer],
                num_heads=config.num_heads[i_layer],
                drop_path=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                downsample=Swinv2PatchMerging if (i_layer < self.num_layers - 1) else None,
                pretrained_window_size=pretrained_window_sizes[i_layer],
            )
            layers.append(stage)
        self.layers = nn.ModuleList(layers)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, Swinv2EncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__, hidden_states, input_dimensions, layer_head_mask, output_attentions
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    input_dimensions,
                    layer_head_mask,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return Swinv2EncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


@auto_docstring
class Swinv2PreTrainedModel(PreTrainedModel):
    config_class = Swinv2Config
    base_model_prefix = "swinv2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Swinv2Stage"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, Swinv2Embeddings):
            if module.mask_token is not None:
                module.mask_token.data.zero_()
            if module.position_embeddings is not None:
                module.position_embeddings.data.zero_()
        elif isinstance(module, Swinv2SelfAttention):
            module.logit_scale.data.fill_(math.log(10))


@auto_docstring
class Swinv2Model(Swinv2PreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        r"""
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        """
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_layers - 1))

        self.embeddings = Swinv2Embeddings(config, use_mask_token=use_mask_token)
        self.encoder = Swinv2Encoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Swinv2ModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        head_mask = self.get_head_mask(head_mask, len(self.config.depths))

        embedding_output, input_dimensions = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        return Swinv2ModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    Swinv2 Model with a decoder on top for masked image modeling, as proposed in
    [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class Swinv2ForMaskedImageModeling(Swinv2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.swinv2 = Swinv2Model(config, add_pooling_layer=False, use_mask_token=True)

        num_features = int(config.embed_dim * 2 ** (config.num_layers - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Swinv2MaskedImageModelingOutput]:
        r"""
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

Examples:
```python
>>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

>>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
>>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
>>> # create random boolean mask of shape (batch_size, num_patches)
>>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

>>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
>>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
>>> list(reconstructed_pixel_values.shape)
[1, 3, 256, 256]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.swinv2(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            # Average the L1 loss over the masked pixels only, then over channels
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return Swinv2MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )

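# Decoder shape arithmetic for the SimMIM head above (illustrative, using the
# swinv2-tiny numbers: embed_dim=96, num_layers=4, encoder_stride=32, image_size=256,
# patch_size=4, num_channels=3):
#   encoder output:   num_features = 96 * 2**(4 - 1) = 768 channels on an 8x8 grid
#                     (256 / 4 = 64 patches per side, halved by three patch mergings)
#   1x1 convolution:  768 -> 32**2 * 3 = 3072 channels, still 8x8
#   PixelShuffle(32): (3072, 8, 8) -> (3, 256, 256), a full-resolution reconstruction
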
@auto_docstring(
    custom_intro="""
    Swinv2 Model transformer with an image classification head on top (a linear layer on top of the average-pooled
    final hidden state) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune SwinV2 on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrU 4S jr\       SS\\R                     S\\R                     S\\R                     S\\
   S\\
   S\
S	\\
   S
\\\4   4S jj5       rSrU =r$ )Swinv2ForImageClassificationi  c                 D  > [         TU ]  U5        UR                  U l        [        U5      U l        UR                  S:  a5  [
        R                  " U R                  R                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g rr  )rh   ri   
num_labelsr  r  r   r   r  rl  
classifierr  )r8   r   rj   s     r)   ri   %Swinv2ForImageClassification.__init__  sx      ++!&) GMFWFWZ[F[BIIdkk..0A0ABacalalan 	
 	r(   r   r  labelsr  r  r   r  rV   c           	      V   Ub  UOU R                   R                  nU R                  UUUUUUS9nUS   n	U R                  U	5      n
SnUb  U R	                  XXR                   S9nU(       d  U
4USS -   nUb  U4U-   $ U$ [        UU
UR                  UR                  UR                  S9$ )ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.swinv2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels=labels, pooled_logits=logits, config=self.config)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return Swinv2ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )

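# Illustrative usage sketch (not part of the upstream module): classification with a
# randomly initialized model and a hypothetical 10-class head, so the snippet runs
# offline; real use would load a fine-tuned checkpoint via `from_pretrained`.
if __name__ == "__main__":
    config = Swinv2Config(num_labels=10)
    model = Swinv2ForImageClassification(config)
    pixel_values = torch.rand(1, 3, config.image_size, config.image_size)
    with torch.no_grad():
        logits = model(pixel_values).logits  # shape (1, 10)
    print(logits.argmax(-1).item())
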
@auto_docstring(
    custom_intro="""
    Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                   r   ^  \ rS rSrU 4S jrS r\   SS\S\\	   S\\	   S\\	   S\
4
S	 jj5       rS
rU =r$ )Swinv2Backbonei  c           	        > [         TU ]  U5        [         TU ]	  U5        UR                  /[	        [        UR                  5      5       Vs/ s H  n[        UR                  SU-  -  5      PM      sn-   U l        [        U5      U l
        [        XR                  R                  5      U l        U R                  5         g s  snf )Nr@   )rh   ri   _init_backboner   r  rG  r  r   r  ry   r   r  r   r  r  )r8   r   r  rj   s      r)   ri   Swinv2Backbone.__init__
  s     v&#--.X]^abhbobo^pXq1rXqST#f6F6FA6M2NXq1rr*62$V__-G-GH 	 2ss   %C c                 .    U R                   R                  $ rg   r  r7   s    r)   r  #Swinv2Backbone.get_input_embeddings  r  r(   r   r  r  r  rV   c           
      6   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      u  pVU R                  UUSUSSUS9nU(       a  UR                  OUS   nSn	[        U R                  U5       H  u  pXR                  ;   d  M  X4-  n	M     U(       d#  U	4nU(       a  XS   4-  nU(       a  XS   4-  nU$ [        U	U(       a  UR                  OSUR                  S9$ )	a  
Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
>>> model = AutoBackbone.from_pretrained(
...     "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 8, 8]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions

        embedding_output, input_dimensions = self.embeddings(pixel_values)

        outputs = self.encoder(
            embedding_output,
            input_dimensions,
            head_mask=None,
            output_attentions=output_attentions,
            output_hidden_states=True,
            output_hidden_states_before_downsampling=True,
            return_dict=return_dict,
        )

        hidden_states = outputs.reshaped_hidden_states if return_dict else outputs[-1]

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                feature_maps += (hidden_state,)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs[1],)
            if output_attentions:
                output += (outputs[2],)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions,
        )

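# Stage-width arithmetic for the backbone above (illustrative, swinv2-tiny numbers:
# embed_dim=96, depths=[2, 2, 6, 2]):
#   num_features = [96] + [96 * 2**i for i in range(4)] = [96, 96, 192, 384, 768],
#   one entry for the stem plus one per stage. With a 256x256 input the matching
#   spatial grids are 64x64, 64x64, 32x32, 16x16 and 8x8, which is why the doctest
#   above ends at a (768, 8, 8) feature map for "stage4".
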
__all__ = [
    "Swinv2ForImageClassification",
    "Swinv2ForMaskedImageModeling",
    "Swinv2Model",
    "Swinv2PreTrainedModel",
    "Swinv2Backbone",
]