
    fThe             
          S r SSKrSSKJr  SSKJrJrJrJrJ	r	  SSK
r
SSKr
SSK
Jr  SSKJrJrJr  SSKJr  SS	KJrJrJrJrJr  SS
KJr  SSKJrJrJr  SSKJ r   SSK!J"r"  \RF                  " \$5      r%\ " S S\5      5       r&\ " S S\5      5       r'\ " S S\5      5       r(\ " S S\5      5       r) " S S\RT                  5      r+ " S S\RT                  5      r, " S S\RT                  5      r-SIS\
R\                  S\/S\0S \
R\                  4S! jjr1 " S" S#\RT                  5      r2 " S$ S%\RT                  5      r3 " S& S'\RT                  5      r4 " S( S)\RT                  5      r5S*\
R\                  S+\\6   S,\\6   S \
R\                  4S- jr7 " S. S/\RT                  5      r8S*\
R\                  S0\\6\64   S1\\6\64   S2\\\6      S \
R\                  4
S3 jr9\ " S4 S5\5      5       r: " S6 S7\RT                  5      r;\ " S8 S9\:5      5       r< " S: S;\RT                  5      r= " S< S=\RT                  5      r>\" S>S?9 " S@ SA\:5      5       r?\" SBS?9 " SC SD\:5      5       r@\" SES?9 " SF SG\:\ 5      5       rA/ SHQrBg)JzPyTorch Hiera model.    N)	dataclass)DictListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputModelOutput)PreTrainedModel)auto_docstringlogging	torch_int)BackboneMixin   )HieraConfigc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)
HieraEncoderOutput+   a  
Hiera encoder's outputs, with potential hidden states and attentions.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. Thesre are the unrolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r    r!   __static_attributes__r"       `/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/hiera/modeling_hiera.pyr   r   +   s}    2 6:x 1 129=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr,   r   c                   J   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\R                  \	S'   Sr\\R                     \	S'   Sr\\\R                  S4      \	S	'   Sr\\\R                  S4      \	S
'   Sr\\\R                  S4      \	S'   Srg)HieraModelOutputL   a  
Hiera model's outputs that also contains a pooling of the last hidden states.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
        Tensor indicating which patches are masked (0) and which are not (1).
    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor containing the original index of the (shuffled) masked patches.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nr   pooler_outputbool_masked_posids_restore.r   r    r!   r"   )r#   r$   r%   r&   r'   r   r   r(   r)   r*   r1   r2   
BoolTensorr3   
LongTensorr   r   r    r!   r+   r"   r,   r-   r/   r/   L   s    > 6:x 1 12915M8E--.5(,OU%%,.2K%**+2=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr,   r/   c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)!HieraForImageClassificationOutputv   a  
Hiera image classification outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
        Loss value for the training task.
    logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
        Prediction scores of the classification head (logits of the output layer).
    hidden_states (`tuple(torch.FloatTensor)`, `optional`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, `optional`):
        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
        heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
Nlosslogits.r   r    r!   r"   )r#   r$   r%   r&   r'   r9   r   r(   r)   r*   r:   r   r   r    r!   r+   r"   r,   r-   r7   r7   v   s    6 )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>FJHU5+<+<c+A%BCJr,   r7   c                   >   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\R                  \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Srg)HieraForPreTrainingOutput   a  
Class for HieraForPreTraining's outputs, with potential hidden states and attentions.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`):
        Pixel reconstruction loss.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
        Pixel reconstruction logits.
    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
        Tensor indicating which patches are masked (0) and which are not (1).
    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Tensor containing the original index of the (shuffled) masked patches.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
        the self-attention heads.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
        plus the initial embedding outputs reshaped to include the spatial dimensions.
Nr9   r:   r2   r3   r   r    r!   r"   )r#   r$   r%   r&   r'   r9   r   r(   r)   r*   r:   r2   r4   r3   r5   r   r   r    r!   r+   r"   r,   r-   r<   r<      s    4 )-D(5$$
%,*.FHU&&'.(,OU%%,.2K%**+28<M8E%"3"345<59Ju00129AEHU5+<+<%=>Er,   r<   c                     ^  \ rS rSrSrSS\4U 4S jjjr SS\R                  S\	\R                     S\R                  4S jjr SS\R                  S	\	\R                     S\\R                  \R                  4   4S
 jjr SS\R                  S	\	\R                     S\\R                  \	\R                     \	\R                     4   4S jjrSrU =r$ )HieraPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
is_maec                   > [         TU ]  5         [        UR                  5      U l        U R                  S:w  a  [        SU R                   S35      eUR                  U l        UR                  SS  U l        [        UR                  UR                  5       VVs/ s H	  u  p4X4-  PM     snnU l
        [        U R                  UR                  5       VVs/ s H	  u  p4X4-  PM     snnU l        UR                  U l        X l        [        R                   " U R                  UR"                  UR                  UR                  UR$                  S9U l        g s  snnf s  snnf )N   zAThe number of dimensions of the input image should be 2, but got .)kernel_sizestridepadding)super__init__len
patch_sizespatial_dims
ValueErrornum_channels
image_sizezippatch_stridetokens_spatial_shapemasked_unit_sizemask_spatial_shape
mask_ratiorA   r	   Conv2d	embed_dimpatch_padding
projection)selfconfigrA   is	__class__s        r-   rJ   HieraPatchEmbeddings.__init__   s*      1 12!`aearar`sstuvv"// ++BC08;F<M<MvObOb8c$d8cQV8c$d!69$:S:SU[UlUl6m"n6mda166m"n ++))))&&((
	 %e"ns   EEpixel_valuesr2   returnc                    Uc  U R                  U5      $ UR                  SS nUR                  " UR                  S   S/U R                  Q76 n[        R
                  R                  UR                  5       US9nU R                  X-  5      $ )zyZero-out the masked regions of the input before conv.
Prevents leakage of masked regions when using overlapping kernels.
NrC   r   r   )size)rZ   shapeviewrU   r	   
functionalinterpolatefloat)r[   ra   r2   target_sizes       r-   masked_conv HieraPatchEmbeddings.masked_conv   s     "??<00"((,)..|/A/A!/Dab$JaJab--33O4I4I4KR]3^|=>>r,   noisec                    UR                   S   n[        R                  " U R                  5      n[	        USU R
                  -
  -  5      nUc  [        R                  " X4UR                  S9n[        R                  " USS9n[        R                  " USS9R                  UR                  5      n[        R                  " X4/UR                  S9nSUSS2SU24'   [        R                  " USUS9R                  5       nX4$ )a  
Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
noise.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`)
    noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
        mainly used for testing purposes to control randomness and maintain the reproducibility
r   r   Ndevicedim)rr   index)re   mathprodrU   intrV   r(   randrp   argsorttozerosgatherbool)	r[   ra   rm   
batch_sizenum_windowslen_keepids_shuffler3   r2   s	            r-   random_masking#HieraPatchEmbeddings.random_masking   s     "''*
ii 7 78{a$//&9:;=JJz|?R?RSE mmEq1mmKQ7::<;N;NO  ++z&?H[H[\()9H9%,,A[QVVX++r,   c                     U R                   (       a  U R                  XS9OSu  p4U R                  X5      nUR                  S5      R	                  SS5      nXSU4$ )Nrm   )NNrC   r   )rA   r   rk   flatten	transpose)r[   ra   rm   r2   r3   
embeddingss         r-   forwardHieraPatchEmbeddings.forward  s_     ?CkkD:| 	' %%lD
''*44Q:
K77r,   )rP   rA   rV   rU   rO   rZ   rM   rS   FN)r#   r$   r%   r&   r'   r|   rJ   r(   r)   r   r4   Tensorrk   r   r5   r   r   r+   __classcell__r_   s   @r-   r?   r?      s    
t 
 
, ^b?!--?@HIYIY@Z?	?$ UY ,!-- ,6>u?P?P6Q ,	u!1!11	2 ,J .28''8 ))*8 
u||Xe&6&67%BRBR9SS	T	8 8r,   r?   c                     ^  \ rS rSrSrSS\S\SS4U 4S jjjrS\R                  S	\R                  S
\
S\
S\R                  4
S jrS\R                  S
\
S\
S\S\R                  4
S jr  SS\R                  S\\R                     S\S\\R                  \\R                      \\R"                     4   4S jjrSrU =r$ )HieraEmbeddingsi  z*
Construct position and patch embeddings.
r\   rA   rb   Nc                   > [         TU ]  5         UR                  U l        [        UR                  UR                  5       VVs/ s H	  u  p4X4-  PM     nnn[        XQR
                  5       VVs/ s H	  u  p4X4-  PM     snnU l        [        R                  " U5      U l	        X l
        [        XS9U l        [        R                  " [        R                   " SU R                  UR"                  5      5      U l        g s  snnf s  snnf )NrA   r   )rI   rJ   rR   rQ   rP   rT   rU   rt   ru   
num_tokensrA   r?   patch_embeddingsr	   	Parameterr(   rz   rX   position_embeddings)r[   r\   rA   r]   r^   rS   r_   s         r-   rJ   HieraEmbeddings.__init__"  s    "//36v7H7H&J]J]3^_3^413^_69:NPgPg6h"i6hda166h"i))$89 4V K#%<<AtPVP`P`0a#b   `"is   C=/Dr   
pos_embedsheightwidthc                    UR                   S   nUR                   S   n[        R                  R                  5       (       d  XV:X  a  X4:X  a  U$ UR                   S   nX0R                  S   -  nX@R                  S   -  n	[        US-  5      n
UR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SSS	9nUR                  SSSS5      R                  SSU5      nU$ )
a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing, no class embeddings, and different patch strides.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r         ?r   rC   bicubicF)rd   modealign_corners)re   r(   jit
is_tracingrR   r   reshapepermuter	   rg   rh   rf   )r[   r   r   r   r   num_patchesnum_positionsrr   
new_height	new_widthsqrt_num_positionss              r-   interpolate_pos_encoding(HieraEmbeddings.interpolate_pos_encoding.  s    !&&q)"((+ yy##%%+*F6?r"0033
..q11	&}c'9:''+=SVW
''1a3
]]..(	 / 

  ''1a388BD
r,   r   c                 `    U(       a  U R                  XR                  X#5      $ U R                  $ r   )r   r   )r[   r   r   r   r   s        r-   get_position_embedding&HieraEmbeddings.get_position_embeddingT  s5    
 ( ))*6N6NPV^	
 ))	
r,   ra   rm   c                 x    UR                   SS  u  pEU R                  XS9u  pgnX`R                  XdXS5      -   nXgU4$ )NrE   r   )re   r   r   )	r[   ra   rm   r   r   r   r   r2   r3   s	            r-   r   HieraEmbeddings.forward]  sQ     %**23/373H3H3H3c0
["="=jRW"rr
K77r,   )rA   rU   r   r   rR   r   r   NF)r#   r$   r%   r&   r'   r   r|   rJ   r(   r   rv   r   r)   r   r   r   r4   r5   r   r+   r   r   s   @r-   r   r     s   
c{ 
cD 
cT 
c 
c$,,$49LL$JM$VY$	$L
,,
03
<?
[_
			
 .2).		8''	8 ))*	8 #'		8
 
u||Xe&6&67%BRBR9SS	T	8 	8r,   r   c                      ^  \ rS rSrSr   SS\S\S\S\S\S\S	S
4U 4S jjjr  SS\R                  S\
\R                     S\S	\\R                  \
\R                     4   4S jjrSrU =r$ )HieraMaskUnitAttentionii  z
Computes either Mask Unit or Global Attention. Also is able to perform query pooling.

Note: this assumes the tokens have already been flattened and unrolled into mask units.
hidden_sizehidden_size_output	num_headsquery_stridewindow_sizeuse_mask_unit_attnrb   Nc                   > [         TU ]  5         X0l        X@l        X l        X#-  U l        U R
                  S-  U l        [        R                  " USU-  5      U l	        [        R                  " X"5      U l
        XPl        X`l        g )Ng      r   )rI   rJ   r   r   r   head_dimscaler	   Linearqkvprojr   r   )r[   r   r   r   r   r   r   r_   s          r-   rJ   HieraMaskUnitAttention.__init__p  sp     	"("4*7mm,
99[!.@*@AII0E	&"4r,   r   	head_maskoutput_attentionsc                 
   UR                   u  pEnSnU R                  (       a  XPR                  U R                  -  -  nU R	                  U5      nUR                  USUSU R                  U R                  5      nUR                  SSSSSS5      nUR                  S5      u  pnU R                  S:  aK  U	R                  X@R                  XpR                  SU R                  5      n	U	R                  SS9R                  n	XR                  -  U
R                  SS	5      -  nUR                  SS9nUb  X-  nX-  nUR                  SS5      R                  USU R                   5      nU R#                  U5      nU(       a  X4$ US
4$ )z3Input should be of shape [batch, tokens, channels].r   r   r   r      rC      rq   rE   N)re   r   r   r   r   r   r   r   r   unbindrf   maxvaluesr   r   softmaxr   r   )r[   r   r   r   r}   seq_len_r~   r   querykeyvalueattn_weightsattn_outputs                 r-   r   HieraMaskUnitAttention.forward  so    "/!4!4
Q""!&7&7$:J:J&JKKhh}%kk*b+q$..$--Xkk!Q1a+JJqMEq JJz>>;HYHY[]_c_l_lmEII!I$++E

*cmmB.CC#+++3  '3L"*!++Aq199*b$JaJabii,.?*XkSWEXXr,   )	r   r   r   r   r   r   r   r   r   )r   r   Fr   )r#   r$   r%   r&   r'   rv   r|   rJ   r(   r   r   r)   r   r   r+   r   r   s   @r-   r   r   i  s     #(55  5 	5
 5 5 !5 
5 54 26"'	#Y||#Y E--.#Y  	#Y
 
u||Xell33	4#Y #Yr,   r   input	drop_probtrainingrb   c                    US:X  d  U(       d  U $ SU-
  nU R                   S   4SU R                  S-
  -  -   nU[        R                  " X@R                  U R
                  S9-   nUR                  5         U R                  U5      U-  nU$ )a*  
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
argument.
        r   r   )r   )dtyperp   )re   ndimr(   rw   r   rp   floor_div)r   r   r   	keep_probre   random_tensoroutputs          r-   	drop_pathr     s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr,   c                      ^  \ rS rSrSrSS\\   SS4U 4S jjjrS\R                  S\R                  4S jr
S\4S	 jrS
rU =r$ )HieraDropPathi  zXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rb   c                 .   > [         TU ]  5         Xl        g r   )rI   rJ   r   )r[   r   r_   s     r-   rJ   HieraDropPath.__init__  s    "r,   r   c                 B    [        XR                  U R                  5      $ r   )r   r   r   r[   r   s     r-   r   HieraDropPath.forward  s    FFr,   c                 8    SR                  U R                  5      $ )Nzp={})formatr   r[   s    r-   
extra_reprHieraDropPath.extra_repr  s    }}T^^,,r,   )r   r   )r#   r$   r%   r&   r'   r   ri   rJ   r(   r   r   strr   r+   r   r   s   @r-   r   r     sQ    b#(5/ #T # #GU\\ Gell G-C - -r,   r   c                   n   ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	HieraMlpi  rr   rb   Nc                   > [         TU ]  5         [        UR                     U l        [
        R                  " U[        X!R                  -  5      5      U l	        [
        R                  " [        X!R                  -  5      U5      U l
        g r   )rI   rJ   r   
hidden_actactivation_fnr	   r   rv   	mlp_ratiofc1fc2)r[   r\   rr   r_   s      r-   rJ   HieraMlp.__init__  sa    #F$5$5699S#c,<,<&<"=>99S'7'7!78#>r,   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   s     r-   r   HieraMlp.forward  s4    /**=9/r,   )r   r   r   )r#   r$   r%   r&   rv   rJ   r(   r   r   r+   r   r   s   @r-   r   r     s6    ?C ?D ?U\\ ell  r,   r   c                      ^  \ rS rSr    SS\S\S\S\S\S\S\S	S
4U 4S jjjr  SS\R                  S\
\R                     S\S	\\R                  \
\R                     4   4S jjrSrU =r$ )
HieraLayeri  r   r   r   r   r   r   r   rb   Nc	           	        > [         T	U ]  5         X l        X0l        X`l        [
        R                  " X!R                  S9U l        [        UUUUUUS9U l
        [
        R                  " X1R                  S9U l        [        X5      U l        US:  a  [        U5      O[
        R                  " 5       U l        X#:w  a  [
        R"                  " X#5      U l        g g )Neps)r   r   r   r   r   r   r   )rI   rJ   r   r   r   r	   	LayerNormlayer_norm_epslayernorm_beforer   attnlayernorm_afterr   mlpr   Identityr   r   r   )
r[   r\   r   r   r   r   r   r   r   r_   s
            r-   rJ   HieraLayer.__init__  s     	&"4( "[>S>S T*#1%#1
	  "||,>DYDYZF75>]y1,		+BDI -r,   r   r   r   c                    UR                   u  pEnU R                  U5      nU R                  U R                  :w  aO  U R	                  U5      nUR                  X@R                  SU R                  5      R                  SS9R                  nU R                  XrUS9u  pxXR                  U5      -   nUn	U R                  U5      nU R                  U5      nXR                  U5      -   nX4$ )Nr   r   rq   r   )re   r   r   r   r   rf   r   r   r   r   r   r   r  )
r[   r   r   r   r}   r   r   hidden_states_normr   residuals
             r-   r   HieraLayer.forward  s     "/!4!4
Q!22=At666 II&89M "":/@/@"dF]F]^bbghbipp  .2YY=N .7 .
*	 &7I(JJ ,,];/ >>-#@@,,r,   )	r   r   r   r   r   r   r  r   r   )r   r   r   Fr   )r#   r$   r%   r&   rv   ri   r|   rJ   r(   r   r   r)   r   r   r+   r   r   s   @r-   r   r     s     #( C  C  	 C
  C  C  C  C ! C 
 C  CJ 26"'	-||- E--.-  	-
 
u||Xell33	4- -r,   r   c                      ^  \ rS rSr SS\S\S\S\S\\   S\\   S	\S
\S\\   SS4U 4S jjjr	 SS\
R                  S\\
R                     S\S\\
R                  \\
R                     4   4S jjrSrU =r$ )
HieraStagei  Ndepthr   r   r   r   r   r   r   	stage_numrb   c                 2  > [         TU ]  5         SnU
b  UR                  U
S:  a  U
S-
  OS   n[        R                  " [        U5       Vs/ s H4  n[        UUS:X  a  UOUUUXl   X|   UU	=(       d    U=(       a    US:H  S9PM6     sn5      U l        g s  snf )NFr   r   )r\   r   r   r   r   r   r   r   )rI   rJ   masked_unit_attentionr	   
ModuleListranger   layers)r[   r\   r  r   r   r   r   r   r   r   r  $previous_stage_used_masked_attentionr]   r_   s                r-   rJ   HieraStage.__init__   s     	
 05, 393O3OajmnanPY\]P]tu3v0mm u &A !/0Av;M'9''l!- +'9'n>b>mghlmgm	 &
s   ;Br   r   r   c                 j    [        U R                  5       H  u  pEUb  X$   OS nU" XUS9u  pM     UW4$ )Nr  )	enumerater  )r[   r   r   r   r]   layer_modulelayer_head_maskr   s           r-   r   HieraStage.forwardE  sI      )5OA.7.CilO,8BS-)]L  6 l**r,   )r  r   r   )r#   r$   r%   r&   rv   r   ri   r|   r   rJ   r(   r   r)   r   r   r+   r   r   s   @r-   r
  r
    s     $(#
 #
 	#

  #
 #
 ;#
 3i#
 #
 !#
 C=#
 
#
 #
L ns	+"\\	+6>u?P?P6Q	+fj	+	u||Xell33	4	+ 	+r,   r
  r   re   mask_unit_shapec                    U R                   S   U R                   S   pC[        X5       VVs/ s H	  u  pVXV-  PM     nnnU R                  " U/UQUQUP76 n U R                  SSSSSS5      n U R                  " U/UQUP76 n U $ s  snnf )a=  
Restore spatial organization by undoing windowed organization of mask units.

Args:
    hidden_states (`torch.Tensor`): The hidden states tensor of shape `[batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size]`.
    shape (`List[int]`): The original shape of the hidden states tensor before windowing.
    mask_unit_shape (`List[int]`): The shape of the mask units used for windowing.

Returns:
    torch.Tensor: The restored hidden states tensor of shape [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size].
r   r   r   r   rC   r   r   )re   rQ   rf   r   r   )r   re   r  r}   r   r^   munum_mask_unitss           r-   undo_windowingr  Q  s     ,11!4m6I6I"6M ,/u+FG+F%!ag+FNG!&&zbNb_bVabM "))!Q1a;M!))*JuJkJM Hs   Bc                     ^  \ rS rSrS\SS4U 4S jjr SS\R                  S\S\	\R                     S\R                  4S	 jjr     SS\R                  S\	\R                     S
\	\R                     S\S\S\S\\\4   4S jjrSrU =r$ )HieraEncoderik  r\   rb   Nc                   > [         TU ]  5         [        UR                  5      n[        R
                  " SUR                  USS9 Vs/ s H  o3R                  5       PM     nn[        R                  " UR                  SS9R                  S5      R                  5       nUS UR                   n[        U5       Vs/ s H*  owU;   a   [        R                  " UR                  5      OSPM,     nn[         R"                  " 5       U l        UR&                  n	S/U-   n
[        R                  " UR(                  5      n[        R                  " UR                  5      n[+        UR                  5       H  u  p[-        UR&                  UR.                  U-  -  5      n[1        UUU	UUR2                  U   XJU   XS-       XU   XS-       [-        XU* -  -  5      UR4                  U   US9
nUn	U R$                  R7                  U5        M     [9        UR:                  UR<                  5       VVs/ s H  u  nnUU-  PM     nnnUR                  /[?        UR                  S S 5      -  n0 U l         [        [?        UR                  5      5       HX  nUU4U R@                  U'   XR                  :  d  M%  [9        UUR                  5       VVs/ s H  u  nnUU-  PM     nnnUSS  nMZ     SU l!        g s  snf s  snf s  snnf s  snnf )Nr   cpuro   r   )
r\   r  r   r   r   r   r   r   r   r  r   F)"rI   rJ   sumdepthsr(   linspacedrop_path_rateitemtensorcumsumtolistnum_query_poolr  rt   ru   r   r	   r  stagesrX   rT   r  rv   embed_dim_multiplierr
  r   r  appendrQ   rP   rR   rK   schedulegradient_checkpointing)r[   r\   total_depthxdprcumulative_depthsquery_pool_layerr]   query_stridesr   
stage_endsmasked_unit_areaquery_stride_area	idx_stager  r   stager^   
stage_sizeunroll_scheduler_   s                       r-   rJ   HieraEncoder.__init__l  s   &--(!&63H3H+^c!de!dAvvx!de!LLuELLQOVVX,-Dv/D/DEafgrastas\]@P;P6#6#67VWWast mmo&&S,,
99V%<%<= IIf&9&9: )&-- 8I!$V%5%58S8SU^8^%^!_'#5 **953jQ6OP*i+@:Z[mC\] 0yj3P PQ#)#?#?	#J#E -KKKu%# !9, *-V->->@S@S)TU)TAa1f)T
U!../#fmmCR6H2IIs6==12I'6
'BDMM)$00014ZATAT1UV1UAa1f1U
V"1!""5	 3 ',#Y f u< V Ws   K0;1K5,K:
L r   	stage_idxr2   c           
         U R                   U   u  pEUR                  u  pgn[        U5      n	S/U	-  n
U H  nUR                  " U/UQU[        R
                  " U5      -  PU
QUP76 nUR                  SSSSSSS5      n[        U	5       H  nX==   X   -  ss'   M     UR                  " US/U
QUP76 nUR                  S   nM     UR                  " Xg/U
QUP76 nUb  U$ [        XU
5      nU$ )	a,  
Roll the given tensor back up to spatial order assuming it's from the given block.

If no bool_masked_pos is provided returns:
    - [batch_size, height, width, hidden_size]
If a bool_masked_pos is provided returns:
    - [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
r   r   r   r   rC   r      r   )
r.  re   rK   rf   rt   ru   r   r  r   r  )r[   r   r>  r2   r.  rd   r}   r   r   num_dimr  stridesr]   s                r-   rerollHieraEncoder.reroll  s-    y1+8+>+>(
[d)#-G)..$&-71C&CFUWbM *11!Q1aAFM 7^"gj0" $)11*b`?`T_`M#))!,G  $ &**:^^R]^ &   '}OLr,   r   r   output_hidden_statesreturn_dictc                 $   U(       a  SOS nU(       a  SOS nU(       a  SOS n	U(       a  Xq4-   nU R                  USUS9n
X4-   n[        U R                  5       H  u  pUb  X;   OS nU R                  (       a/  U R                  (       a  U R                  UR                  XU5      nO	U" XU5      nUS   nU(       a  XS   4-   n	U(       d  Mt  Xq4-   nU R                  XUS9n
X4-   nM     U(       d  [        S XX4 5       5      $ [        UUU	US9$ )Nr"   r   )r>  r2   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r"   ).0vs     r-   	<genexpr>'HieraEncoder.forward.<locals>.<genexpr>  s      lA ls   	)r   r   r    r!   )	rC  r  r+  r/  r   _gradient_checkpointing_func__call__tupler   )r[   r   r2   r   r   rE  rF  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr!   r]   stage_moduler  layer_outputss                  r-   r   HieraEncoder.forward  sB    #7BD+?RT"$5b4 14D D%)[[!]l[%m")CF_)_&(5OA.7.CilO**t}} $ A A ))=K\! !-]M^ _)!,M &91=M<O&O###$58H$H!)-]ap)q&-GJc-c*%  6(  '<Ol  
 "++*#=	
 	
r,   )r/  r.  r+  r   )NNFFT)r#   r$   r%   r&   r   rJ   r(   r   rv   r   r4   rC  r)   r|   r   rO  r   r   r+   r   r   s   @r-   r  r  k  s    0,{ 0,t 0,f jn,"\\,69,LTUZUeUeLf,	,b 7;15"'%* 1
||1
 "%"2"231
 E--.	1

  1
 #1
 1
 
uo%	&1
 1
r,   r  image_shaperR   r.  c           	         U R                   u  pEn[        X5       VVs/ s H	  u  pxXx-  PM     n	nnU	n
U R                  " U/U
-   U/-   6 n U H  n[        X5       VVs/ s H	  u  pxXx-  PM     n
nn[        X5       VVs/ s H  o  H  oPM     M     nnnU/U-   U/-   nU R                  U5      n [        U5      nS/[	        [        SUS-
  S5      5      -   [	        [        SUS-
  S5      5      -   US-
  /-   nU R                  U5      n U R                  S[        U5      5      n U[        R                  " U5      -  nM     U R                  S[        R                  " U	5      U5      n U $ s  snnf s  snnf s  snnf )ap  
Reorders the tokens such that patches are contiguous in memory.
E.g., given [batch_size, (height, width), hidden_size] and stride of (stride, stride), this will re-order the tokens as
[batch_size, (stride, stride, height // stride, width // stride), hidden_size]

This allows operations like Max2d to be computed as x.view(batch_size, stride*stride, -1, hidden_size).max(dim=1).
Not only is this faster, but it also makes it easy to support inputs of arbitrary
dimensions in addition to patch-wise sparsity.

Performing this operation multiple times in sequence puts entire windows as contiguous
in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of
size 8x8 would be contiguous in memory, allowing operations like mask unit attention
computed easily and efficiently, while also allowing max to be applied sequentially.

Note: This means that intermediate values of the model are not in height x width order, so they
need to be re-rolled if you want to use the intermediate values as a height x width feature map.
The last block of the network is fine though, since by then the strides are all consumed.
r   rC   r   r   )re   rQ   rf   rK   listr  r   r   rt   ru   r   )r   rV  rR   r.  r}   r   r   r]   r^   rd   current_sizerB  pairr&  	new_shapenum_dimsr   s                    r-   unrollr]     s   * "/!4!4J;";=>=tqAF=D>L!&&*)D})TVM
 ,/|+EF+E41+EF&),&@R&@dTTTTT&@	RL9,}<	%**95 y>#U1hlA677$uQSTVW?X:YY]ehi]i\jj%--g6 &--aW>dii((
) , "))"diio{KM9 ? GRs   E+E1>E7c                   .    \ rS rSr\rSrSrSrSS jr	Sr
g)	HieraPreTrainedModeli6  hierara   TNc                    U R                   R                  n[        U[        5      (       a)  [        R
                  R                  UR                  US9  g[        U[        5      (       aQ  [        R
                  R                  UR                  US9  [        R
                  R                  UR                  US9  g[        U[        R                  [        R                  [        R                  45      (       aa  [        R
                  R                  UR                  US9  UR                  b+  [        R
                  R!                  UR                  U5        gg[        U[        R"                  5      (       ai  [        R
                  R!                  UR                  U5        [        R
                  R!                  UR                  U R                   R$                  5        gg)zInitialize the weights)stdN)r\   initializer_range
isinstancer   r	   inittrunc_normal_r   HieraDecoder
mask_tokendecoder_position_embeddingsr   Conv1drW   weightbias	constant_r   layer_norm_init)r[   modulerb  s      r-   _init_weights"HieraPreTrainedModel._init_weights=  s6   kk++fo..GG!!&"<"<#!F--GG!!&"3"3!=GG!!&"D"D#!NBIIryy ABBGG!!&--S!9{{&!!&++s3 ' --GGfkk3/GGfmmT[[-H-HI .r,   r"   )rb   N)r#   r$   r%   r&   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingrp  r+   r"   r,   r-   r_  r_  6  s    L$O&*#Jr,   r_  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )HieraPooleriR  r\   c                   > [         TU ]  5         [        UR                  UR                  [        UR                  5      S-
  -  -  5      n[        R                  " X!R                  S9U l
        [        R                  " S5      U l        g )Nr   r   )rI   rJ   rv   rX   r,  rK   r#  r	   r   r   	layernormAdaptiveAvgPool1dpooler)r[   r\   num_featuresr_   s      r-   rJ   HieraPooler.__init__S  sh    6++f.I.IcRXR_R_N`cdNd.eefl8M8MN**1-r,   r   rb   c                     UR                  SS5      nU R                  U5      n[        R                  " US5      nU R	                  U5      nU$ )Nr   rC   )r   r{  r(   r   ry  )r[   r   pooled_outputs      r-   r   HieraPooler.forwardY  sF    %//15M2mQ7}5r,   )ry  r{  )r#   r$   r%   r&   r   rJ   r(   r   r   r+   r   r   s   @r-   rw  rw  R  s/    .{ .U\\ ell  r,   rw  c                   "  ^  \ rS rSrSS\S\S\4U 4S jjjrS\4S jrS\	\
\\
   4   SS	4S
 jr\       SS\\R                      S\\R"                     S\\R                      S\\   S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )
HieraModelia  r\   add_pooling_layerrA   c                   > [         TU ]  U5        [        UR                  UR                  [        UR                  5      S-
  -  -  5      U l        [        XS9U l	        [        U5      U l        UR                  /[        UR                  SS 5      -  U l        U(       a  [        U5      OSU l        U R!                  5         g)z
add_pooling_layer (`bool`, *optional*, defaults to `True`):
    Whether or not to apply pooling layer.
is_mae (`bool`, *optional*, defaults to `False`):
    Whether or not to run the model on MAE mode.
r   r   Nr   )rI   rJ   rv   rX   r,  rK   r#  r|  r   r   r  encoderr   r<  rw  r{  	post_init)r[   r\   r  rA   r_   s       r-   rJ   HieraModel.__init__c  s     	  0 063N3NSVW]WdWdSehiSi3j jk)&@#F+ & 3 34s6==";M7NN->k&)D 	r,   rb   c                 .    U R                   R                  $ r   r   r   r   s    r-   get_input_embeddingsHieraModel.get_input_embeddingsw      ///r,   heads_to_pruneNc                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  layer	attentionprune_heads)r[   r  r  headss       r-   _prune_headsHieraModel._prune_headsz  s<    
 +002LELLu%//;;EB 3r,   ra   rm   r   r   rE  r   rF  c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  U[        U R                   R                  5      5      nU R                  XUS9u  pn
UR                  S   UR                  S   4n[        UUU R                   R                  U R                  S9nU	bs  [        R                  " U R                   R                  5      nUR                  u  pnU	R!                  S5      R#                  SUU5      nUU   nUR%                  USU5      nU R'                  UU	UUUUS9nUS	   nSnU R(                  b  U R)                  U5      nU(       d  Ub  UU4OU4nU	b  UX4-   OUnUUSS -   $ [+        UUU	U
UR,                  UR.                  UR0                  S
9$ )z
noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
    Mainly used for testing purposes to control randomness and maintain the reproducibility
Nz You have to specify pixel_values)r   rm   rE   r   )rV  rR   r.  r   )r2   r   r   rE  rF  r   )r   r1   r2   r3   r   r    r!   )r\   r   rE  use_return_dictrN   get_head_maskrK   r#  r   re   r]  rR   r<  rt   ru   rT   	unsqueezetilerf   r  r{  r/   r   r    r!   )r[   ra   rm   r   r   rE  r   rF  embedding_outputr2   r3   rV  r   mask_unit_arear}   r   r   	positionsencoder_outputssequence_outputr  head_outputss                         r-   r   HieraModel.forward  s"    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@ &&y#dkk6H6H2IJ	9=SX :I :
6; $))"-|/A/A"/EF#11))	
 &!YYt{{'C'CDN)6)<)<&J;'11"5::1nkZI))4M)..z2{KM,,+/!5# ' 
 *!,;;" KK8M?L?XO];_n^pLAPA\==bn   /!""555-'+#)77&11#2#I#I
 	
r,   )r   r  r|  r{  r<  )TFNNNNNNN)r#   r$   r%   r&   r   r|   rJ   r?   r  r   rv   r   r  r   r   r(   r   r)   r   r   r   r   r+   r   r   s   @r-   r  r  a  s   { t TX  (0&: 0C4T#Y+? CD C  04-1,0,0/337&*N
u||,N
 ))*N
 ELL)	N

 $D>N
 'tnN
 #+4.N
 d^N
 
u00	1N
 N
r,   r  c                      ^  \ rS rSrS\4U 4S jjr  SS\R                  S\R                  S\	\R                     S\
S\\R                  \R                  4   4
S	 jjrS
rU =r$ )rg  i  r\   c                   > [         TU ]  5         [        UR                  UR                  [        UR                  5      S-
  -  -  5      n[        UR                  UR                  5       VVs/ s H	  u  p4X4-  PM     nnn[        XQR                  5       VVs/ s H  u  p4X4UR                  -  -  PM     snnU l        [        UR                  UR                  5       VVs/ s H  u  p4X4UR                  -  -  PM     snnU l        [        R                   " X!R"                  5      U l        [        R&                  " [(        R*                  " SSUR"                  5      5      U l        [        R&                  " [(        R*                  " S[.        R0                  " U R                  5      UR"                  5      5      U l        [5        UUR"                  UR"                  UR6                  UR8                  SS/UR8                  -  S/UR8                  -  SS9	U l        [        R<                  " UR"                  UR>                  S9U l         UR                  S   UR                  S   UR                  -  -  U l!        U RB                  [        UR                  5      -  URD                  -  n[        R                   " UR"                  U5      U l#        g s  snnf s  snnf s  snnf )Nr   Fr   r   )	r\   r   r   r   r  r   r   r   r   r   r   )$rI   rJ   rv   rX   r,  rK   r#  rQ   rP   rR   r   r*  tokens_spatial_shape_finalrT   mask_unit_spatial_shape_finalr	   r   decoder_hidden_sizedecoder_embeddingsr   r(   rz   rh  rt   ru   ri  r
  decoder_num_headsdecoder_depthdecoder_blockr   r   decoder_normpred_striderO   decoder_pred)r[   r\   r|  r]   r^   rS   pred_dimr_   s          r-   rJ   HieraDecoder.__init__  sZ   6++f.I.IcRXR_R_N`cdNd.eef36v7H7H&J]J]3^_3^413^_:=>RTgTg:h+
:h$!Av,,--:h+
' ;>f>U>UW]WjWj:k.
:k$!Av,,--:k.
* #%))L:T:T"U,,u{{1a9S9S'TU+-<<KK499T%D%DEvGaGab,
( (22%99..&&$ef222v333

 LL)C)CI^I^_ "..r2f6I6I"6MQWQfQf6fg$$F,?,?(@@FDWDWWIIf&@&@(KE  `+
.
s   .KK"Kencoder_hidden_statesr2   r   r   rb   c           
      z   U R                  U5      nUR                  SS  u  pgnUR                  u  p[        R                  " U	U
UUUUR                  UR
                  S9nU R                  R                  SSSSS5      nUR                  XSSS5      nUR                  SSXgU5      nUR                  5       X'   SUR                  5       -
  U-  UR                  5       U-  -   n[        UU R                  U R                  5      n[        USSS24   U R                  U R                  5      nUR                  UR                  S   SUR                  S   5      nUR                  UR                  S   S5      nXPR                  -   nU R!                  XSUS9u  p]U R#                  U5      nU R%                  U5      nXR4$ )NrC   )rp   r   r   r   .r   )r   r   )r  re   r(   rz   rp   r   rh  rf   r   expandr   ri   r  r  r  ri  r  r  r  )r[   r  r2   r   r   r   mask_unit_heightmask_unit_widthr  r}   r  decoder_hidden_statesmask_tokensr   s                 r-   r   HieraDecoder.forward  s    //0EF BOATATUVUWAX>+>%4%:%:"
 % ''%%!
 oo**1aAr:)11*aQRTUV)00R9I\op1>1F1F1H.%%''!)//14II!J
 '!++..

 )C1H%++..
 &--m.A.A!.Db-J]J]^`Jab)..}/B/B1/ErJ &(H(HH '+&8&8BS '9 '
# ))-8 ))-8--r,   )	r  r  r  ri  r  rh  r  r  r  r   )r#   r$   r%   r&   r   rJ   r(   r   r4   r   r|   r   r   r+   r   r   s   @r-   rg  rg    s{    %L{ %LV -1"'>.$||>. ))>. ELL)	>.
  >. 
u||U---	.>. >.r,   rg  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr
S\\R                     S\R                  4S	 jrS
rU =r$ )HieraMultiScaleHeadi=  r\   c           
        > [         TU ]  5         [        UR                  UR                  5       VVs/ s H  u  p#X#UR
                  -  -  PM     snnU l        [        [        UR                  5      5       Vs/ s H(  n[        UR                  UR                  U-  -  5      PM*     snU l        UR                  n[        R                  " 5       U l        [        UR
                  5       H  n[        X@R                  5       VVs/ s H	  u  p#X#-  PM     nnn[        XAR                  5       VVs/ s H	  u  p#X#-  PM     nnnU R                   R#                  [        R$                  " U R                  U   U R                  S   UUS95        M     U R                   R#                  [        R&                  " 5       5        g s  snnf s  snf s  snnf s  snnf )Nr   )rF   rG   )rI   rJ   rQ   rT   r   r*  r  r  rK   r#  rv   rX   r,  stage_dimensionsr	   r  multi_scale_fusion_headsr-  rW   r  )r[   r\   r]   r^   current_masked_unit_sizeidxkernelr_   s          r-   rJ   HieraMultiScaleHead.__init__>  s   :=f>U>UW]WjWj:k.
:k$!Av,,--:k.
* MRRUV\VcVcRdLe!
LeqC  6#>#>#AABLe!
 $*#:#: (*%../C),-EGiGi)jk)jaf)jFk;>?WYlYl;m'n;m41;m$'n))00		))#.))"- &!	 0 	%%,,R[[];).
!
 l'ns   G9/GG1Gheadr   rb   c                 <   [        U[        R                  5      (       a  U$ UR                  u  p4pVnUR	                  X4-  XVU5      nUR                  SSSS5      nU" U5      nUR                  SSSS5      nUR                  SS  u  pnUR	                  X4XU5      nU$ )Nr   r   r   rC   )rd  r	   r  re   r   r   )
r[   r  r   r}   r  r  r  r   mask_unit_height_finalmask_unit_width_finals
             r-   apply_fusion_head%HieraMultiScaleHead.apply_fusion_headV  s    dBKK((   VcUhUhR
$4{ &--')9K
 &--aAq9]+ &--aAq9EREXEXYZY[E\B{%--(>Wb
 r,   feature_mapsc                 n    Sn[        U R                  U5       H  u  p4X R                  X45      -   nM     U$ )Nr   )rQ   r  r  )r[   r  r   r  feature_maps        r-   r   HieraMultiScaleHead.forwardm  s=    !$T%B%BL!QD),B,B4,UUM "R r,   )r  r  r  )r#   r$   r%   r&   r   rJ   r	   Moduler(   r   r  r   r   r+   r   r   s   @r-   r  r  =  s]    <{ <0bii  QVQ]Q] .D$6 5<<  r,   r  a;  
    The Hiera Model transformer with the decoder on top for self-supervised pre-training.

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                     ^  \ rS rSrS\SS4U 4S jjrS\R                  S\R                  S\R                  4S jr	S\R                  S	\R                  S\R                  4S
 jr
\       SS\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )HieraForPreTrainingiv  r\   rb   Nc                 J  > [         TU ]  U5        [        USSS9U l        [        R
                  " U R                  R                  UR                  S9U l        [        U5      U l
        [        U5      U l        U R                  R                  U l        U R                  5         g )NFTr  rA   r   )rI   rJ   r  r`  r	   r   r|  r   encoder_normr  multiscale_fusionrg  decoderr  r  r[   r\   r_   s     r-   rJ   HieraForPreTraining.__init__  sz     %M
LL)@)@fF[F[\!4V!<#F+<<33 	r,   ra   r2   c                 d   UR                  SSSS5      nU R                  nUR                  SX35      R                  SX35      nUR                  SS5      R                  S5      nXB   nU R                  R
                  (       a-  UR                  SSS9nUR                  SSS9nXE-
  US-   S	-  -  nU$ )
Nr   rC   r   r   r   T)rr   keepdimgư>r   )r   r  unfoldr   r\   normalize_pixel_lossmeanvar)r[   ra   r2   rd   labelr  r  s          r-   get_pixel_label_2d&HieraForPreTraining.get_pixel_label_2d  s    #++Aq!Q7##At299!THa#++A.&;;++::"d:3D))D)1C\cFls%::Er,   r:   c                 d    U) nU R                  X5      nX#   nX$-
  S-  nUR                  5       nU$ )NrC   )r  r  )r[   ra   r:   r2   r  r9   s         r-   forward_loss HieraForPreTraining.forward_loss  s>    **''F(1$yy{r,   rm   r   r   rE  r   rF  c           
         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUSUUS9nUS   n	US   n
US   nU	SU R                  R                   R
                  S-    U	S   4-   n	U R                  U	5      nU R                  U5      nU R                  UU
UUS9u  pU R                  XU
5      nU(       d<  XU4nU(       a  XS   4-   nU(       a  XS	   4-   nU(       a  XS   4-   nUb  U4U-   $ U$ [        UUU
UU(       a  UR                  OSUR                  U(       a  UR                  S
9$ SS
9$ )a7  
noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
    Mainly used for testing purposes to control randomness and maintain the reproducibility

Examples:
```python
>>> from transformers import AutoImageProcessor, HieraForPreTraining
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-mae-hf")
>>> model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> loss = outputs.loss
>>> print(list(logits.shape))
[1, 196, 768]
```NT)rm   r   r   rE  r   rF  r   r   rC   )r2   r   r   r   r   )r9   r:   r2   r3   r   r    r!   )r\   r  r   rE  r`  r*  r  r  r  r  r<   r   r    r!   )r[   ra   rm   r   r   rE  r   rF  outputsr  r2   ids_to_restorefused_hidden_statesr:   r9   r   s                   r-   r   HieraForPreTraining.forward  s   H &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 **/!%%=#  
 r{!!* #A

(9(9(H(H1(LMQ]^`QaPcc"44\B"//0CD #',,+/	 #/ #
   G~>F#1:-/ 1:-/#2;.0)-)9TGf$EvE(+&3G'//T))EY7#A#A
 	
 `d
 	
r,   )r  r  r`  r  r  r  )r#   r$   r%   r&   r   rJ   r(   r   r4   r  r  r   r   r)   r|   r   rO  r<   r   r+   r   r   s   @r-   r  r  v  s%   { t u|| eN^N^ chcoco 	 	u|| 	^c^n^n 	  04-1,0,0/337&*W
u||,W
 ))*W
 ELL)	W

 $D>W
 'tnW
 #+4.W
 d^W
 
u//	0W
 W
r,   r  a  
    Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state with
    average pooling) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                      ^  \ rS rSrS\SS4U 4S jjr\      SS\\R                     S\\R                     S\\
   S	\\
   S
\\
   S\\
   S\\\4   4S jj5       rSrU =r$ )HieraForImageClassificationi  r\   rb   Nc                 D  > [         TU ]  U5        UR                  U l        [        USSS9U l        UR                  S:  a5  [
        R                  " U R                  R                  UR                  5      O[
        R                  " 5       U l	        U R                  5         g )NTFr  r   )rI   rJ   
num_labelsr  r`  r	   r   r|  r  
classifierr  r  s     r-   rJ   $HieraForImageClassification.__init__  s|      ++$uM
 FLEVEVYZEZBIIdjj--v/@/@A`b`k`k`m 	
 	r,   r   labelsr   rE  r   rF  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUUS9nUS   n	U R                  U	5      n
SnUGb  UR                  U
R                  5      nU R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       nU R                  S:X  a&  U" U
R                  5       UR                  5       5      nOU" X5      nOU R                   R                  S:X  a=  [!        5       nU" U
R#                  SU R                  5      UR#                  S5      5      nO,U R                   R                  S:X  a  [%        5       nU" X5      nU(       d  U
4USS -   nUb  U4U-   $ U$ ['        UU
UR(                  UR*                  UR,                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r   r   rE  r   rF  r   
regressionsingle_label_classificationmulti_label_classificationr   rC   )r9   r:   r   r    r!   )r\   r  r   rE  r`  r  ry   rp   problem_typer  r   r(   longrv   r   squeezer   rf   r
   r7   r   r    r!   )r[   ra   r   r  r   rE  r   rF  r  r  r:   r9   loss_fctr   s                 r-   r   #HieraForImageClassification.forward#  s   " &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 **/!5%=#  
  
/YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE0!//))#*#A#A
 	
r,   )r  r`  r  )NNNNNN)r#   r$   r%   r&   r   rJ   r   r   r(   r   r|   r   rO  r7   r   r+   r   r   s   @r-   r  r    s    { t   -1)-,0/337&*F
 ELL)F
 &	F

 $D>F
 'tnF
 #+4.F
 d^F
 
u77	8F
 F
r,   r  zN
    Hiera backbone, to be used with frameworks like DETR and MaskFormer.
    c                      ^  \ rS rSrS\4U 4S jjrS r   SS\R                  S\	\
   S\	\
   S\	\
   S	\4
S
 jjrSrU =r$ )HieraBackboneim  r\   c           	      0  > [         TU ]  U5        [         TU ]	  U5        UR                  /[	        [        UR                  5      5       Vs/ s H(  n[        UR                  UR                  U-  -  5      PM*     sn-   U l	        [        USS9U l        [        U5      U l        0 n[        U R                  U R                   5       H  u  pE["        R$                  " U5      X4'   M     ["        R&                  " U5      U l        U R+                  5         g s  snf )NFr   )rI   rJ   _init_backbonerX   r  rK   r#  rv   r,  r|  r   r   r  r  rQ   _out_featureschannelsr	   r   
ModuleDicthidden_states_normsr  )r[   r\   r]   r  r:  rO   r_   s         r-   rJ   HieraBackbone.__init__s  s     v&#--.LQRUV\VcVcRdLe2
LeqC  6#>#>#AABLe2
 
 *&?#F+ !#&t'9'94==#IE)+l)C& $J#%==1D#E  	2
s   /Dc                 .    U R                   R                  $ r   r  r   s    r-   r  "HieraBackbone.get_input_embeddings  r  r,   ra   rE  r   rF  rb   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      u  n  nU R                  USUSUS9nUS   nSn	[        U R                  U5       H  u  pXR                  ;   d  M  UR                  u  ppUR                  XU-  U5      nU R                  U
   " U5      nUR                  XX5      nUR                  SSSS	5      R                  5       nX4-  n	M     U(       d%  U	4nU(       a	  UUS   4-  nU(       a	  UUS	   4-  nU$ [        U	U(       a  US   OSU(       a  US	   S
9$ SS
9$ )a  
Returns:

Examples:

```python
>>> from transformers import AutoImageProcessor, AutoBackbone
>>> import torch
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf")
>>> model = AutoBackbone.from_pretrained(
...     "facebook/hiera-tiny-224-hf", out_features=["stage1", "stage2", "stage3", "stage4"]
... )

>>> inputs = processor(image, return_tensors="pt")
>>> outputs = model(**inputs)
>>> feature_maps = outputs.feature_maps
>>> list(feature_maps[-1].shape)
[1, 768, 7, 7]
```NT)r   r   rE  rF  r   r"   r   r   r   rC   )r  r   r    )r\   r  rE  r   r   r  rQ   stage_namesout_featuresre   rf   r  r   
contiguousr   )r[   ra   rE  r   rF  r  r   r  r   r  r:  hidden_stater}   r   r   rO   r   s                    r-   r   HieraBackbone.forward  s   @ &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq!%!>!Q,,/!%#  
  #&t'7'7#GE))):F:L:L7
E+00e^\Z#77>|L+00UY+33Aq!Q?JJL/ $H "_F#71:-' 71:-'M%(<'!*$%6wqz
 	
 =A
 	
r,   )r   r  r  r|  )NNN)r#   r$   r%   r&   r   rJ   r  r(   r   r   r|   r   r   r+   r   r   s   @r-   r  r  m  sp    { &0 04,0&*H
llH
 'tnH
 $D>	H

 d^H
 
H
 H
r,   r  )r  r  r  r  r_  )r   F)Cr'   rt   dataclassesr   typingr   r   r   r   r   r(   torch.utils.checkpointr	   torch.nnr
   r   r   activationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   r   utils.backbone_utilsr   configuration_hierar   
get_loggerr#   loggerr   r/   r7   r<   r  r?   r   r   r   ri   r|   r   r   r   r   r
  rv   r  r  r]  r_  rw  r  rg  r  r  r  r  __all__r"   r,   r-   <module>r     s     ! 5 5    A A !  . 7 7 1 , 
		H	% K K K@ &K{ &K &KR  K(=  K  KF !F !F !FH[8299 [8|I8bii I8XAYRYY AYJU\\ e T V[VbVb *-BII -ryy =- =-@/+ /+d%,, tCy SWX[S\ afamam 4R
299 R
j3<<3.3CHo3MRSVX[S[_3hlmqrumvhw3
\\3l J? J J6"))  o
% o
 o
df.299 f.R6")) 6r 	A
. A
A
H V
"6 V
V
r 
_
(- _

_
D xr,   