
    fTh              	          S r SSKJrJrJr  SSKrSSKrSSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJrJr  SSKJr  \R4                  " \5      rS;S\S\S\\   S\4S jjr\" S5      \" S5      4S\S\S\S\4S jjr  " S S\RB                  5      r" " S S\RB                  5      r# " S S\RB                  5      r$ " S S\RB                  5      r% " S S \RB                  5      r& " S! S"\RB                  5      r' " S# S$\RB                  5      r( " S% S&\RB                  5      r) " S' S(\RB                  5      r*\ " S) S*\5      5       r+\ " S+ S,\+5      5       r,\" S-S.9 " S/ S0\+5      5       r- " S1 S2\RB                  5      r. " S3 S4\RB                  5      r/ " S5 S6\RB                  5      r0\" S7S.9 " S8 S9\+5      5       r1/ S:Qr2g)<zPyTorch MobileViTV2 model.    )OptionalTupleUnionN)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging   )MobileViTV2Configvaluedivisor	min_valuereturnc                 |    Uc  Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  n[        U5      $ )z
Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
original TensorFlow repo. It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
   g?)maxint)r   r   r   	new_values       l/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.pymake_divisibler   *   sO     	Is5Q;#677BWLMI3;	y>    z-infinfmin_valmax_valc                 ,    [        U[        X 5      5      $ N)r   minr   r"   r#   s      r   clipr(   9   s    wG+,,r    c                      ^  \ rS rSr      SS\S\S\S\S\S\S\S	\S
\S\\\4   SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )MobileViTV2ConvLayer>   configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 D  > [         TU ]  5         [        US-
  S-  5      U-  nX&-  S:w  a  [        SU SU S35      eX6-  S:w  a  [        SU SU S35      e[        R
                  " UUUUUUUUSS	9	U l        U	(       a  [        R                  " US
SSSS9U l        OS U l        U
(       an  [        U
[        5      (       a  [        U
   U l        g [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g S U l        g )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r-   r.   r/   r0   paddingr3   r1   r2   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r8   	__class__s               r   r@   MobileViTV2ConvLayer.__init__?   s,    	{Q!+,x71$/}<STZS[[cdee A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#.."("8F--s33"():):";"("3"3"DOr    featuresc                     U R                  U5      nU R                  b  U R                  U5      nU R                  b  U R                  U5      nU$ r%   )rC   rE   rH   )rJ   rM   s     r   forwardMobileViTV2ConvLayer.forwardu   sK    ##H-)))(3H??&x0Hr    )rH   rC   rE   )r   r   Fr   TT)__name__
__module____qualname____firstlineno__r   r   boolr   rG   r@   torchTensorrO   __static_attributes____classcell__rK   s   @r   r*   r*   >   s     "&+/4#!4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4#l   r    r*   c                      ^  \ rS rSrSr SS\S\S\S\S\SS	4U 4S
 jjjrS\R                  S\R                  4S jr
SrU =r$ )MobileViTV2InvertedResidual   zI
Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
r,   r-   r.   r0   r3   r   Nc           
      6  > [         TU ]  5         [        [        [	        X!R
                  -  5      5      S5      nUS;  a  [        SU S35      eUS:H  =(       a    X#:H  U l        [        XUSS9U l	        [        UUUSUUUS9U l
        [        UUUSS	S
9U l        g )N   )r   r   zInvalid stride .r   )r-   r.   r/   r
   )r-   r.   r/   r0   r1   r3   Fr-   r.   r/   r5   )r?   r@   r   r   roundexpand_ratiorA   use_residualr*   
expand_1x1conv_3x3
reduce_1x1)rJ   r,   r-   r.   r0   r3   expanded_channelsrK   s          r   r@   $MobileViTV2InvertedResidual.__init__   s     	*3u[CVCV5V/W+XZ[\vha899#q[K{/J.:KYZ
 -)*$
 /)% 
r    rM   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  X!-   $ U$ r%   )re   rf   rg   rd   )rJ   rM   residuals      r   rO   #MobileViTV2InvertedResidual.forward   sG    ??8,==*??8,&*&7&7x"EXEr    )rf   re   rg   rd   )r   rQ   rR   rS   rT   __doc__r   r   r@   rV   rW   rO   rX   rY   rZ   s   @r   r\   r\      sn    
 lm
'
69
IL
VY
eh
	
 
BF F F Fr    r\   c                      ^  \ rS rSr SS\S\S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr	Sr
U =r$ )MobileViTV2MobileNetLayer   r,   r-   r.   r0   
num_stagesr   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        U5       H4  n[        UUUUS:X  a  UOSS9nU R                  R                  U5        UnM6     g )Nr   r   )r-   r.   r0   )r?   r@   r   
ModuleListlayerranger\   append)	rJ   r,   r-   r.   r0   rr   iru   rK   s	           r   r@   "MobileViTV2MobileNetLayer.__init__   sc     	]]_
z"A/')!"avQ	E JJe$&K #r    rM   c                 <    U R                    H  nU" U5      nM     U$ r%   ru   )rJ   rM   layer_modules      r   rO   !MobileViTV2MobileNetLayer.forward   s     JJL#H-H 'r    r{   )r   r   rQ   rR   rS   rT   r   r   r@   rV   rW   rO   rX   rY   rZ   s   @r   rp   rp      s`    qr'''69'IL'VY'kn'	' '    r    rp   c                   v   ^  \ rS rSrSrS\S\SS4U 4S jjrS\R                  S\R                  4S	 jr
S
rU =r$ )MobileViTV2LinearSelfAttention   aQ  
This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
https://arxiv.org/abs/2206.02680

Args:
    config (`MobileVitv2Config`):
         Model configuration object
    embed_dim (`int`):
        `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
r,   	embed_dimr   Nc           
         > [         TU ]  5         [        UUSSU-  -   SSSSS9U l        [        R
                  " UR                  S9U l        [        UUUSSSSS9U l        X l        g )Nr   r   TF)r,   r-   r.   r2   r/   r4   r5   p)	r?   r@   r*   qkv_projr   Dropoutattn_dropoutout_projr   )rJ   r,   r   rK   s      r   r@   'MobileViTV2LinearSelfAttention.__init__   sy    ,!a)m,# 
 JJ)<)<=,!"# 
 #r    hidden_statesc                    U R                  U5      n[        R                  " USU R                  U R                  /SS9u  p4n[        R                  R
                  R                  USS9nU R                  U5      nXF-  n[        R                  " USSS9n[        R                  R
                  R                  U5      UR                  U5      -  nU R                  U5      nU$ )Nr   )split_size_or_sectionsdimr   Tr   keepdim)r   rV   splitr   r   
functionalsoftmaxr   sumrelu	expand_asr   )	rJ   r   qkvquerykeyr   context_scorescontext_vectorouts	            r   rO   &MobileViTV2LinearSelfAttention.forward   s    mmM*
 "KKQX\XfXfDgmnoE ,,44U4C**>: ->r4H hh!!&&u-0H0H0OOmmC 
r    )r   r   r   r   rm   rZ   s   @r   r   r      sC    	#0 #S #T #2U\\ ell  r    r   c                      ^  \ rS rSr SS\S\S\S\SS4
U 4S jjjrS	\R                  S\R                  4S
 jr
SrU =r$ )MobileViTV2FFNi  r,   r   ffn_latent_dimffn_dropoutr   Nc                    > [         TU ]  5         [        UUUSSSSSS9U l        [        R
                  " U5      U l        [        UUUSSSSSS9U l        [        R
                  " U5      U l        g )Nr   TF)r,   r-   r.   r/   r0   r2   r4   r5   )	r?   r@   r*   conv1r   r   dropout1conv2dropout2)rJ   r,   r   r   r   rK   s        r   r@   MobileViTV2FFN.__init__  s|     	)!'#	

 

;/)&"# 	

 

;/r    r   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r%   )r   r   r   r   )rJ   r   s     r   rO   MobileViTV2FFN.forward'  s@    

=1m4

=1m4r    )r   r   r   r           rQ   rR   rS   rT   r   r   floatr@   rV   rW   rO   rX   rY   rZ   s   @r   r   r     sc     !0!0 0 	0
 0 
0 0@U\\ ell  r    r   c                      ^  \ rS rSr SS\S\S\S\SS4
U 4S jjjrS	\R                  S\R                  4S
 jr
SrU =r$ )MobileViTV2TransformerLayeri/  r,   r   r   dropoutr   Nc                 :  > [         TU ]  5         [        R                  " SX!R                  S9U l        [        X5      U l        [        R                  " US9U l	        [        R                  " SX!R                  S9U l
        [        XX1R                  5      U l        g )Nr   
num_groupsnum_channelsr;   r   )r?   r@   r   	GroupNormlayer_norm_epslayernorm_beforer   	attentionr   r   layernorm_afterr   r   ffn)rJ   r,   r   r   r   rK   s        r   r@   $MobileViTV2TransformerLayer.__init__0  sr     	 "	WlWl m7J

W-!||qyVkVkl!&^EWEWXr    r   c                     U R                  U5      nU R                  U5      nX1-   nU R                  U5      nU R                  U5      nXA-   nU$ r%   )r   r   r   r   )rJ   r   layernorm_1_outattention_outputlayer_outputs        r   rO   #MobileViTV2TransformerLayer.forward>  sU    //>>>/:(8++M:xx-#3r    )r   r   r   r   r   r   r   rZ   s   @r   r   r   /  si     Y!Y Y 	Y
 Y 
Y Y	U\\ 	ell 	 	r    r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTV2TransformeriJ  r,   n_layersd_modelr   Nc                 <  > [         T	U ]  5         UR                  nXC-  /U-  nU Vs/ s H  n[        US-  S-  5      PM     nn[        R
                  " 5       U l        [        U5       H*  n[        XXW   S9nU R                  R                  U5        M,     g s  snf )N   )r   r   )
r?   r@   ffn_multiplierr   r   rt   ru   rv   r   rw   )
rJ   r,   r   r   r   ffn_dimsd	block_idxtransformer_layerrK   s
            r   r@   MobileViTV2Transformer.__init__K  s    ..",-8 2::ACbB':]]_
xI ;(:M! JJ/0	 ) ;s   Br   c                 <    U R                    H  nU" U5      nM     U$ r%   r{   )rJ   r   r|   s      r   rO   MobileViTV2Transformer.forward\  s      JJL(7M 'r    r{   r~   rZ   s   @r   r   r   J  sF    10 1C 1# 1RV 1"U\\ ell  r    r   c                   *  ^  \ rS rSrSr   SS\S\S\S\S\S\S	\S
S4U 4S jjjrS\R                  S
\
\R                  \
\\4   4   4S jrS\R                  S\
\\4   S
\R                  4S jrS\R                  S
\R                  4S jrSrU =r$ )MobileViTV2Layerib  z5
MobileViTV2 layer: https://arxiv.org/abs/2206.02680
r,   r-   r.   attn_unit_dimn_attn_blocksr3   r0   r   Nc           	        > [         T	U ]  5         UR                  U l        UR                  U l        UnUS:X  a(  [        UUUUS:X  a  UOSUS:  a  US-  OSS9U l        UnOS U l        [        UUUUR                  US9U l	        [        UUUSSSS9U l
        [        XUS9U l        [        R                  " SXAR                  S9U l        [        UUUSS	SS9U l        g )
Nr   r   )r-   r.   r0   r3   )r-   r.   r/   r1   F)r-   r.   r/   r4   r5   )r   r   r   T)r?   r@   
patch_sizepatch_widthpatch_heightr\   downsampling_layerr*   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projection)
rJ   r,   r-   r.   r   r   r3   r0   cnn_out_dimrK   s
            r   r@   MobileViTV2Layer.__init__g  s    	!,,"--#Q;&A')!)QvA*2Q,QA'D# 'K&*D# -#$//
 -#$# 
 2&Zgh TiTij  4#$"  
r    feature_mapc                    UR                   u  p#pE[        R                  R                  UU R                  U R
                  4U R                  U R
                  4S9nUR                  X#U R                  U R
                  -  S5      nXdU44$ )N)r/   r0   r   )shaper   r   unfoldr   r   reshape)rJ   r   
batch_sizer-   
img_height	img_widthpatchess          r   	unfoldingMobileViTV2Layer.unfolding  s    9D9J9J6
--&&**D,<,<=%%t'7'78 ' 

 //*4;L;LtO_O_;_acdY///r    r   output_sizec                     UR                   u  p4pVUR                  X4U-  U5      n[        R                  R	                  UUU R
                  U R                  4U R
                  U R                  4S9nU$ )N)r   r/   r0   )r   r   r   r   foldr   r   )rJ   r   r   r   in_dimr   	n_patchesr   s           r   foldingMobileViTV2Layer.folding  st    4;MM1
J//*z.A9Mmm((#**D,<,<=%%t'7'78	 ) 
 r    rM   c                 <   U R                   (       a  U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      u  p#U R	                  U5      nU R                  U5      nU R                  X#5      nU R                  U5      nU$ r%   )r   r   r   r   r   r   r   r   )rJ   rM   r   r   s       r   rO   MobileViTV2Layer.forward  s    ""..x8H ==*==*  $~~h7 ""7+..) <<5''1r    )r   r   r   r   r   r   r   r   )r   r   r   )rQ   rR   rS   rT   rn   r   r   r@   rV   rW   r   r   r   rO   rX   rY   rZ   s   @r   r   r   b  s     ;
!;
 ;
 	;

 ;
 ;
 ;
 ;
 
;
 ;
z	0U\\ 	0eELL%PSUXPX/<Y6Z 	0u|| %S/ ell    r    r   c                   t   ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\S\S\	\
\4   4S	 jjrS
rU =r$ )MobileViTV2Encoderi  r,   r   Nc           
        > [         TU ]  5         Xl        [        R                  " 5       U l        SU l        S=p#UR                  S:X  a  SnSnOUR                  S:X  a  SnSn[        [        SUR                  -  SSS9SSS	9n[        SUR                  -  SS
9n[        SUR                  -  SS
9n[        SUR                  -  SS
9n[        SUR                  -  SS
9n	[        SUR                  -  SS
9n
[        UUUSSS9nU R
                  R                  U5        [        UUUSSS9nU R
                  R                  U5        [        UUU[        UR                  S   UR                  -  SS
9UR                  S   S9nU R
                  R                  U5        U(       a  US-  n[        UUU	[        UR                  S   UR                  -  SS
9UR                  S   US9nU R
                  R                  U5        U(       a  US-  n[        UU	U
[        UR                  S   UR                  -  SS
9UR                  S   US9nU R
                  R                  U5        g )NFr_   Tr   r       @   r'   r   r   r         i     )r-   r.   r0   rr   r   r   )r-   r.   r   r   )r-   r.   r   r   r3   )r?   r@   r,   r   rt   ru   gradient_checkpointingoutput_strider   r(   width_multiplierrp   rw   r   base_attn_unit_dimsr   )rJ   r,   dilate_layer_4dilate_layer_5r3   layer_0_dimlayer_1_dimlayer_2_dimlayer_3_dimlayer_4_dimlayer_5_dimlayer_1layer_2layer_3layer_4layer_5rK   s                   r   r@   MobileViTV2Encoder.__init__  sx   ]]_
&+# +0/1$!N!N!!R'!N$rF333RLVWce
 %R&*A*A%A2N$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN$S6+B+B%BAN+#$
 	

'"+#$
 	

'""#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"MH"#$()C)CA)FI`I`)`jkl ..q1
 	

'"r    r   output_hidden_statesreturn_dictc                 D   U(       a  SOS n[        U R                  5       HZ  u  pVU R                  (       a.  U R                  (       a  U R	                  UR
                  U5      nOU" U5      nU(       d  MU  XA4-   nM\     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   .   #    U  H  oc  M  Uv   M     g 7fr%   r  ).0vs     r   	<genexpr>-MobileViTV2Encoder.forward.<locals>.<genexpr>;  s     X$Fq$Fs   	)last_hidden_stater   )	enumerateru   r   training_gradient_checkpointing_func__call__tupler   )rJ   r   r  r  all_hidden_statesrx   r|   s          r   rO   MobileViTV2Encoder.forward&  s     #7BD(4OA**t}} $ A A ))!!
 !-] ;##$58H$H!  5 X]$FXXX-oor    )r,   r   ru   )FT)rQ   rR   rS   rT   r   r@   rV   rW   rU   r   r   r   rO   rX   rY   rZ   s   @r   r   r     sh    O#0 O#T O#h &+ 	p||p #p 	p
 
u44	5p pr    r   c                       \ rS rSr\rSrSrSrS/r	S\
\R                  \R                  \R                  4   SS4S	 jrS
rg)MobileViTV2PreTrainedModeli@  mobilevitv2pixel_valuesTr   moduler   Nc                 
   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weightsr   )meanstdNg      ?)rF   r   LinearrB   weightdatanormal_r,   initializer_ranger2   zero_	LayerNormfill_)rJ   r'  s     r   _init_weights(MobileViTV2PreTrainedModel._init_weightsI  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r    r  )rQ   rR   rS   rT   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r+  rB   r1  r3  rX   r  r    r   r$  r$  @  sO     %L%$O&*#+,
*E"))RYY*L$M 
*RV 
*r    r$  c                      ^  \ rS rSrSS\S\4U 4S jjjrS r\   SS\	\
R                     S\	\   S\	\   S	\\\4   4S
 jj5       rSrU =r$ )MobileViTV2ModeliV  r,   expand_outputc           
         > [         TU ]  U5        Xl        X l        [	        [        SUR                  -  SSS9SSS9n[        UUR                  USSS	S	S
9U l	        [        U5      U l        U R                  5         g)z
expand_output (`bool`, *optional*, defaults to `True`):
    Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
    hidden states. If `False`, only the hidden states will be returned.
r   r   r   r'   r_   r   r
   r   Tr-   r.   r/   r0   r4   r5   N)r?   r@   r,   r<  r   r(   r  r*   r   	conv_stemr   encoder	post_init)rJ   r,   r<  r  rK   s       r   r@   MobileViTV2Model.__init__X  s     	 *$rF333RLVWce
 .++$"
 *&1 	r    c                    UR                  5        Hm  u  p#U R                  R                  U   n[        U[        5      (       d  M5  UR
                  R                   H  nUR                  R                  U5        M      Mo     g)zPrunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
N)itemsr@  ru   rF   r   r   r   prune_heads)rJ   heads_to_prunelayer_indexheadsmobilevitv2_layerr   s         r   _prune_headsMobileViTV2Model._prune_headst  sj     #1"6"6"8K $ 2 2; ?+-=>>):)F)F)L)L%%//;;EB *M #9r    r&  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nU R                  (       a  US   n[        R                  " USS/SS9nOUS   nS nU(       d  Ub  Xg4OU4nXSS  -   $ [        UUUR                  S	9$ )
Nz You have to specify pixel_valuesr  r  r   r   Fr   r   )r  pooler_outputr   )r,   r  use_return_dictrA   r?  r@  r<  rV   r)  r   r   )	rJ   r&  r  r  embedding_outputencoder_outputsr  pooled_outputoutputs	            r   rO   MobileViTV2Model.forward~  s     %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  / 2 "JJ'8r2hPUVM / 2 M;H;T'7[lZnFAB///7/')77
 	
r    )r,   r?  r@  r<  )T)NNN)rQ   rR   rS   rT   r   rU   r@   rJ  r   r   rV   rW   r   r   r   rO   rX   rY   rZ   s   @r   r;  r;  V  s    0   8C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r    r;  z
    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\
   S\\R                     S	\\
   S\\\4   4
S
 jj5       rSrU =r$ )!MobileViTV2ForImageClassificationi  r,   r   Nc                 D  > [         TU ]  U5        UR                  U l        [        U5      U l        [        SUR                  -  SS9nUR                  S:  a  [        R                  " X!R                  S9O[        R                  " 5       U l
        U R                  5         g )Nr   r_   r   r   )in_featuresout_features)r?   r@   
num_labelsr;  r%  r   r  r   r+  Identity
classifierrA  )rJ   r,   r.   rK   s      r   r@   *MobileViTV2ForImageClassification.__init__  s      +++F3%cF,C,C&CQO   1$ II,=N=NO 	 	r    r&  r  labelsr  c                 2   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U5      nSnUGb  U R                   R
                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R
                  S:X  aI  [        5       n	U R                  S:X  a&  U	" UR                  5       UR                  5       5      nOU	" Xs5      nOU R                   R
                  S:X  a=  [        5       n	U	" UR                  SU R                  5      UR                  S5      5      nO,U R                   R
                  S:X  a  [        5       n	U	" Xs5      nU(       d  U4USS -   n
Ub  U4U
-   $ U
$ [!        UUUR"                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NrM  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r,   rP  r%  rO  r^  problem_typer\  dtyperV   longr   r	   squeezer   viewr   r   r   )rJ   r&  r  r`  r  outputsrS  rf  re  loss_fctrT  s              r   rO   )MobileViTV2ForImageClassification.forward  s    &1%<k$++B]B]""<hs"t1<--'!*/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r    )r^  r%  r\  NNNN)rQ   rR   rS   rT   r   r@   r   r   rV   rW   rU   r   r   r   rO   rX   rY   rZ   s   @r   rX  rX    s    0 T "  04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r    rX  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTV2ASPPPoolingi  r,   r-   r.   r   Nc           
      |   > [         TU ]  5         [        R                  " SS9U l        [        UUUSSSSS9U l        g )Nr   )r   Tr   r>  )r?   r@   r   AdaptiveAvgPool2dglobal_poolr*   r   )rJ   r,   r-   r.   rK   s       r   r@   MobileViTV2ASPPPooling.__init__  sB    //A>,#%"!
r    rM   c                     UR                   SS  nU R                  U5      nU R                  U5      n[        R                  R                  XSSS9nU$ )NrN  bilinearFsizemodealign_corners)r   rt  r   r   r   interpolate)rJ   rM   spatial_sizes      r   rO   MobileViTV2ASPPPooling.forward
  sQ    ~~bc*##H-==*==,,Xzin,or    )r   rt  r~   rZ   s   @r   rq  rq    sG    
0 
s 
RU 
Z^ 
   r    rq  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTV2ASPPi  zk
ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
r,   r   Nc                   > [         TU ]  5         [        SUR                  -  SS9nUnUR                  n[        UR                  5      S:w  a  [        S5      e[        R                  " 5       U l
        [        UUUSSS9nU R                  R                  U5        U R                  R                  UR                   Vs/ s H  n[        UUUSUSS	9PM     sn5        [        XU5      nU R                  R                  U5        [        US
U-  USSS9U l        [        R                   " UR"                  S9U l        g s  snf )Nr   r_   r   r
   z"Expected 3 values for atrous_ratesr   r   ra   )r-   r.   r/   r3   r5      r   )r?   r@   r   r  aspp_out_channelslenatrous_ratesrA   r   rt   convsr*   rw   extendrq  projectr   aspp_dropout_probr   )	rJ   r,   encoder_out_channelsr-   r.   in_projectionrate
pool_layerrK   s	           r   r@   MobileViTV2ASPP.__init__  s;   -cF4K4K.KUVW*//v""#q(ABB]]_
,#%!
 	

-(

 #//
 0D % +!- !!#) 0
	
 ,FN


*%+L 0|YZkq
 zzF$<$<=)
s   >D=rM   c                     / nU R                    H  nUR                  U" U5      5        M     [        R                  " USS9nU R	                  U5      nU R                  U5      nU$ )Nr   r   )r  rw   rV   catr  r   )rJ   rM   pyramidconvpooled_featuress        r   rO   MobileViTV2ASPP.forwardC  sW    JJDNN4>* ))G+,,w/,,7r    )r  r   r  rQ   rR   rS   rT   rn   r   r@   rV   rW   rO   rX   rY   rZ   s   @r   r  r    s=    *>0 *>T *>X   r    r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTV2DeepLabV3iO  z:
DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
r,   r   Nc           
         > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        UUR                  UR                  SSSSS9U l        g )Nr   FT)r-   r.   r/   r4   r5   r2   )r?   r@   r  asppr   	Dropout2dclassifier_dropout_probr   r*   r  r\  r^  rJ   r,   rK   s     r   r@   MobileViTV2DeepLabV3.__init__T  s]    #F+	||F$B$BC.00**# 
r    r   c                 r    U R                  US   5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   r^  )rJ   r   rM   s      r   rO   MobileViTV2DeepLabV3.forwardd  s6    99]2./<<)??8,r    )r  r^  r   r  rZ   s   @r   r  r  O  s<    
0 
T 
 U\\ ell  r    r  zZ
    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\R                     S\\
   S	\\
   S\\\4   4
S
 jj5       rSrU =r$ )"MobileViTV2ForSemanticSegmentationik  r,   r   Nc                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g )NF)r<  )r?   r@   r\  r;  r%  r  segmentation_headrA  r  s     r   r@   +MobileViTV2ForSemanticSegmentation.__init__q  sE      +++F%H!5f!= 	r    r&  r`  r  r  c                 z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  USUS9nU(       a  UR                  OUS   nU R                  U5      nSnUbQ  [        R                  R                  XrR                  SS SSS	9n	[        U R                   R                  S
9n
U
" X5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [        UUU(       a  UR                  SS9$ SSS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
>>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTrM  rN  rw  Frx  )ignore_indexr   )re  rf  r   
attentions)r,   r  rP  r\  rA   r%  r   r  r   r   r|  r   r   semantic_loss_ignore_indexr   )rJ   r&  r`  r  r  rl  encoder_hidden_statesrf  re  upsampled_logitsrm  rT  s               r   rO   *MobileViTV2ForSemanticSegmentation.forward{  so   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO""!%# # 
 :E 5 5'RS*''(=>!}}88\\"#.Zu  9   (T[[5[5[\H,5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r    )r%  r\  r  ro  )rQ   rR   rS   rT   r   r@   r   r   rV   rW   rU   r   r   r   rO   rX   rY   rZ   s   @r   r  r  k  s    0 T   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r    r  )rX  r  r;  r$  )r_   N)3rn   typingr   r   r   rV   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   utilsr   r   configuration_mobilevitv2r   
get_loggerrQ   loggerr   r   r   r(   Moduler*   r\   rp   r   r   r   r   r   r   r$  r;  rX  rq  r  r  r  __all__r  r    r   <module>r     s  " ! ) )    A A !  . , 8 
		H	%#  HSM UX  ).fe - - - -Y^ -
=299 =B-F")) -Fb		 .<RYY <~&RYY &R")) 6RYY 0oryy odip ipX * * *( O
1 O
 O
d G
(B G
G
VRYY 09bii 9z299 8 
U
)C U

U
pr    