# coding=utf-8
# Copyright 2023 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch FocalNet model."""

import collections.abc
import math
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BackboneOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from ...utils.backbone_utils import BackboneMixin
from .configuration_focalnet import FocalNetConfig


logger = logging.get_logger(__name__)


@dataclass
class FocalNetEncoderOutput(ModelOutput):
    """
    FocalNet encoder's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class FocalNetModelOutput(ModelOutput):
    """
    FocalNet model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    pooler_output: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class FocalNetMaskedImageModelingOutput(ModelOutput):
    """
    FocalNet masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    reconstruction: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class FocalNetImageClassifierOutput(ModelOutput):
    """
    FocalNet outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    reshaped_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
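
# Shape sketch for the embedding pipeline defined below (a hedged walkthrough, assuming the
# FocalNetConfig defaults image_size=224, patch_size=4, embed_dim=96, num_channels=3):
#
#     pixel_values                     (batch_size, 3, 224, 224)
#     -> patch projection (Conv2d)     (batch_size, 96, 56, 56)
#     -> flatten + transpose           (batch_size, 56 * 56, 96), output_dimensions = (56, 56)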


class FocalNetEmbeddings(nn.Module):
    """
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    """

    def __init__(self, config, use_mask_token=False):
        super().__init__()

        self.patch_embeddings = FocalNetPatchEmbeddings(
            config=config,
            image_size=config.image_size,
            patch_size=config.patch_size,
            num_channels=config.num_channels,
            embed_dim=config.embed_dim,
            use_conv_embed=config.use_conv_embed,
            is_stem=True,
        )
        self.patch_grid = self.patch_embeddings.grid_size
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.embed_dim)) if use_mask_token else None

        self.norm = nn.LayerNorm(config.embed_dim, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
    ) -> Tuple[torch.Tensor]:
        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
        embeddings = self.norm(embeddings)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        embeddings = self.dropout(embeddings)
        return embeddings, output_dimensions


class FocalNetPatchEmbeddings(nn.Module):
    def __init__(
        self,
        config,
        image_size,
        patch_size,
        num_channels,
        embed_dim,
        add_norm=False,
        use_conv_embed=False,
        is_stem=False,
    ):
        super().__init__()
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.projection = nn.Conv2d(
                num_channels, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
            )
        else:
            self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)

        if add_norm:
            self.norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        else:
            self.norm = None

    def maybe_pad(self, pixel_values, height, width):
        if width % self.patch_size[1] != 0:
            pad_values = (0, self.patch_size[1] - width % self.patch_size[1])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        if height % self.patch_size[0] != 0:
            pad_values = (0, 0, 0, self.patch_size[0] - height % self.patch_size[0])
            pixel_values = nn.functional.pad(pixel_values, pad_values)
        return pixel_values

    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> Tuple[torch.Tensor, Tuple[int]]:
        _, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        # pad the input to be divisible by self.patch_size, if needed
        pixel_values = self.maybe_pad(pixel_values, height, width)
        embeddings = self.projection(pixel_values)
        _, _, height, width = embeddings.shape
        output_dimensions = (height, width)
        embeddings = embeddings.flatten(2).transpose(1, 2)

        if self.norm is not None:
            embeddings = self.norm(embeddings)

        return embeddings, output_dimensions


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class FocalNetDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class FocalNetModulation(nn.Module):
    def __init__(self, config, index, dim, focal_factor=2, bias=True, projection_dropout=0.0):
        super().__init__()

        self.dim = dim
        self.focal_window = config.focal_windows[index]
        self.focal_level = config.focal_levels[index]
        self.focal_factor = focal_factor
        self.use_post_layernorm_in_modulation = config.use_post_layernorm_in_modulation
        self.normalize_modulator = config.normalize_modulator

        self.projection_in = nn.Linear(dim, 2 * dim + (self.focal_level + 1), bias=bias)
        self.projection_context = nn.Conv2d(dim, dim, kernel_size=1, stride=1, bias=bias)

        self.activation = nn.GELU()
        self.projection_out = nn.Linear(dim, dim)
        self.projection_dropout = nn.Dropout(projection_dropout)
        self.focal_layers = nn.ModuleList()

        self.kernel_sizes = []
        for k in range(self.focal_level):
            kernel_size = self.focal_factor * k + self.focal_window
            self.focal_layers.append(
                nn.Sequential(
                    nn.Conv2d(
                        dim, dim, kernel_size=kernel_size, stride=1, groups=dim, padding=kernel_size // 2, bias=False
                    ),
                    nn.GELU(),
                )
            )
            self.kernel_sizes.append(kernel_size)
        if self.use_post_layernorm_in_modulation:
            self.layernorm = nn.LayerNorm(dim, eps=config.layer_norm_eps)

    def forward(self, hidden_state):
        """
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        """
        num_channels = hidden_state.shape[-1]

        # pre linear projection
        x = self.projection_in(hidden_state).permute(0, 3, 1, 2).contiguous()
        q, ctx, self.gates = torch.split(x, (num_channels, num_channels, self.focal_level + 1), 1)

        # context aggregation
        ctx_all = 0
        for level in range(self.focal_level):
            ctx = self.focal_layers[level](ctx)
            ctx_all = ctx_all + ctx * self.gates[:, level : level + 1]
        ctx_global = self.activation(ctx.mean(dim=2, keepdim=True).mean(dim=3, keepdim=True))
        ctx_all = ctx_all + ctx_global * self.gates[:, self.focal_level :]

        # normalize context
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        # focal modulation
        self.modulator = self.projection_context(ctx_all)
        x_out = q * self.modulator
        x_out = x_out.permute(0, 2, 3, 1).contiguous()
        if self.use_post_layernorm_in_modulation:
            x_out = self.layernorm(x_out)

        # post linear projection
        x_out = self.projection_out(x_out)
        x_out = self.projection_dropout(x_out)
        return x_out


class FocalNetMlp(nn.Module):
    def __init__(self, config, in_features, hidden_features=None, out_features=None, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.activation = ACT2FN[config.hidden_act]
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, hidden_state):
        hidden_state = self.fc1(hidden_state)
        hidden_state = self.activation(hidden_state)
        hidden_state = self.drop(hidden_state)
        hidden_state = self.fc2(hidden_state)
        hidden_state = self.drop(hidden_state)
        return hidden_state


class FocalNetLayer(nn.Module):
    r"""Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`Tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    """

    def __init__(self, config, index, dim, input_resolution, drop_path=0.0):
        super().__init__()

        self.config = config

        # layer-specific attributes
        self.dim = dim
        self.input_resolution = input_resolution

        # general attributes
        self.drop = config.hidden_dropout_prob
        self.use_post_layernorm = config.use_post_layernorm

        self.norm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.modulation = FocalNetModulation(
            config=config,
            index=index,
            dim=dim,
            projection_dropout=self.drop,
        )

        self.drop_path = FocalNetDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        mlp_hidden_dim = int(dim * config.mlp_ratio)
        self.mlp = FocalNetMlp(config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=self.drop)

        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if config.use_layerscale:
            self.gamma_1 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)
            self.gamma_2 = nn.Parameter(config.layerscale_value * torch.ones((dim)), requires_grad=True)

    def forward(self, hidden_state, input_dimensions):
        height, width = input_dimensions
        batch_size, _, num_channels = hidden_state.shape
        shortcut = hidden_state

        # Focal Modulation
        hidden_state = hidden_state if self.use_post_layernorm else self.norm1(hidden_state)
        hidden_state = hidden_state.view(batch_size, height, width, num_channels)
        hidden_state = self.modulation(hidden_state).view(batch_size, height * width, num_channels)
        hidden_state = hidden_state if not self.use_post_layernorm else self.norm1(hidden_state)

        # FFN
        hidden_state = shortcut + self.drop_path(self.gamma_1 * hidden_state)
        hidden_state = hidden_state + self.drop_path(
            self.gamma_2
            * (self.norm2(self.mlp(hidden_state)) if self.use_post_layernorm else self.mlp(self.norm2(hidden_state)))
        )

        return hidden_state


class FocalNetStage(nn.Module):
    def __init__(self, config, index, input_resolution):
        super().__init__()

        self.config = config
        self.num_stages = len(config.depths)

        embed_dim = [config.embed_dim * (2**i) for i in range(self.num_stages)]
        dim = embed_dim[index]
        out_dim = embed_dim[index + 1] if (index < self.num_stages - 1) else None
        downsample = FocalNetPatchEmbeddings if (index < self.num_stages - 1) else None

        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]
        drop_path = dpr[sum(config.depths[:index]) : sum(config.depths[: index + 1])]

        self.layers = nn.ModuleList(
            [
                FocalNetLayer(
                    config=config,
                    index=index,
                    dim=dim,
                    input_resolution=input_resolution,
                    drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
                )
                for i in range(config.depths[index])
            ]
        )

        if downsample is not None:
            self.downsample = downsample(
                config=config,
                image_size=input_resolution,
                patch_size=2,
                num_channels=dim,
                embed_dim=out_dim,
                add_norm=True,
                use_conv_embed=config.use_conv_embed,
                is_stem=False,
            )
        else:
            self.downsample = None

        self.pointing = False

    def forward(self, hidden_states: torch.Tensor, input_dimensions: Tuple[int, int]) -> Tuple[torch.Tensor]:
        height, width = input_dimensions
        for layer_module in self.layers:
            hidden_states = layer_module(hidden_states, input_dimensions)

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height, width = input_dimensions
            hidden_states = hidden_states_before_downsampling.transpose(1, 2).reshape(
                hidden_states_before_downsampling.shape[0], -1, height, width
            )
            hidden_states, output_dimensions = self.downsample(hidden_states)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        return stage_outputs


class FocalNetEncoder(nn.Module):
    def __init__(self, config, grid_size):
        super().__init__()
        self.num_stages = len(config.depths)
        self.config = config

        self.stages = nn.ModuleList(
            [
                FocalNetStage(
                    config=config,
                    index=i_layer,
                    input_resolution=(grid_size[0] // (2**i_layer), grid_size[1] // (2**i_layer)),
                )
                for i_layer in range(self.num_stages)
            ]
        )

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: Tuple[int, int],
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, FocalNetEncoderOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange b (h w) c -> b c h w
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, stage_module in enumerate(self.stages):
            if self.gradient_checkpointing and self.training:
                stage_outputs = self._gradient_checkpointing_func(
                    stage_module.__call__,
                    hidden_states,
                    input_dimensions,
                )
            else:
                stage_outputs = stage_module(hidden_states, input_dimensions)

            hidden_states = stage_outputs[0]
            hidden_states_before_downsampling = stage_outputs[1]
            output_dimensions = stage_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange b (h w) c -> b c h w
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange b (h w) c -> b c h w
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_reshaped_hidden_states] if v is not None)

        return FocalNetEncoderOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            reshaped_hidden_states=all_reshaped_hidden_states,
        )


@auto_docstring
class FocalNetPreTrainedModel(PreTrainedModel):
    config_class = FocalNetConfig
    base_model_prefix = "focalnet"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["FocalNetStage"]

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, FocalNetEmbeddings):
            if module.mask_token is not None:
                module.mask_token.data.zero_()
        elif isinstance(module, FocalNetLayer):
            if self.config.use_layerscale:
                module.gamma_1.data.fill_(self.config.layerscale_value)
                module.gamma_2.data.fill_(self.config.layerscale_value)


@auto_docstring
class FocalNetModel(FocalNetPreTrainedModel):
    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.num_stages = len(config.depths)
        self.num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))

        self.embeddings = FocalNetEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = FocalNetEncoder(config, self.embeddings.patch_grid)

        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        embedding_output, input_dimensions = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            input_dimensions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        pooled_output = None
        if self.pooler is not None:
            pooled_output = self.pooler(sequence_output.transpose(1, 2))
            pooled_output = torch.flatten(pooled_output, 1)

        if not return_dict:
            output = (sequence_output, pooled_output) + encoder_outputs[1:]
            return output

        return FocalNetModelOutput(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class FocalNetForMaskedImageModeling(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.focalnet = FocalNetModel(config, add_pooling_layer=False, use_mask_token=True)

        self.num_stages = len(config.depths)
        num_features = int(config.embed_dim * 2 ** (self.num_stages - 1))
        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=num_features, out_channels=config.encoder_stride**2 * config.num_channels, kernel_size=1
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetMaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output.transpose(1, 2)
        batch_size, num_channels, sequence_length = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[2:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return FocalNetMaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    """
)
class FocalNetForImageClassification(FocalNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.focalnet = FocalNetModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(self.focalnet.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, FocalNetImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.focalnet(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return FocalNetImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            reshaped_hidden_states=outputs.reshaped_hidden_states,
        )


@auto_docstring(
    custom_intro="""
    FocalNet backbone, to be used with frameworks like X-Decoder.
    """
)
class FocalNetBackbone(FocalNetPreTrainedModel, BackboneMixin):
    def __init__(self, config: FocalNetConfig):
        super().__init__(config)
        super()._init_backbone(config)

        self.num_features = [config.embed_dim] + config.hidden_sizes
        self.focalnet = FocalNetModel(config)

        # initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs = self.focalnet(pixel_values, output_hidden_states=True, return_dict=True)

        hidden_states = outputs.reshaped_hidden_states

        feature_maps = ()
        for idx, stage in enumerate(self.stage_names):
            if stage in self.out_features:
                feature_maps += (hidden_states[idx],)

        if not return_dict:
            output = (feature_maps,)
            if output_hidden_states:
                output += (outputs.hidden_states,)
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=None,
        )


__all__ = [
    "FocalNetForImageClassification",
    "FocalNetForMaskedImageModeling",
    "FocalNetBackbone",
    "FocalNetModel",
    "FocalNetPreTrainedModel",
]
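
# Minimal usage sketch (illustrative; "microsoft/focalnet-tiny" is one public FocalNet
# checkpoint -- substitute any other checkpoint, and supply a PIL image of your own):
#
#     from transformers import AutoImageProcessor, FocalNetForImageClassification
#     import torch
#
#     processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny")
#     model = FocalNetForImageClassification.from_pretrained("microsoft/focalnet-tiny")
#     inputs = processor(images=image, return_tensors="pt")
#     with torch.no_grad():
#         logits = model(**inputs).logits
#     print(model.config.id2label[logits.argmax(-1).item()])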