
    fTh              	          S r SSKrSSKJrJrJrJrJr  SSKrSSK	rSSKJ
r
  SSKJrJrJr  SSKJr  SSKJrJrJrJr  SS	KJr  SS
KJrJr  SSKJrJrJr  SSKJ r   \RB                  " \"5      r#S=S\$S\$S\\$   S\$4S jjr% " S S\
RL                  5      r' " S S\
RL                  5      r( " S S\
RL                  5      r) " S S\
RL                  5      r* " S S\
RL                  5      r+ " S S\
RL                  5      r, " S S \
RL                  5      r- " S! S"\
RL                  5      r. " S# S$\
RL                  5      r/ " S% S&\
RL                  5      r0 " S' S(\
RL                  5      r1 " S) S*\
RL                  5      r2\ " S+ S,\5      5       r3\ " S- S.\35      5       r4\" S/S09 " S1 S2\35      5       r5 " S3 S4\
RL                  5      r6 " S5 S6\
RL                  5      r7 " S7 S8\
RL                  5      r8\" S9S09 " S: S;\35      5       r9/ S<Qr:g)>zPyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    Uc  Un[        U[        XS-  -   5      U-  U-  5      nUSU -  :  a  X1-  n[        U5      $ )z
Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
original TensorFlow repo. It can be seen here:
https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
   g?)maxint)r   r   r   	new_values       h/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler$   +   sO     	Is5Q;#677BWLMI3;	y>    c                      ^  \ rS rSr      SS\S\S\S\S\S\S\S	\S
\S\\\4   SS4U 4S jjjr	S\
R                  S\
R                  4S jrSrU =r$ )MobileViTConvLayer:   configin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 D  > [         TU ]  5         [        US-
  S-  5      U-  nX&-  S:w  a  [        SU SU S35      eX6-  S:w  a  [        SU SU S35      e[        R
                  " UUUUUUUUSS	9	U l        U	(       a  [        R                  " US
SSSS9U l        OS U l        U
(       an  [        U
[        5      (       a  [        U
   U l        g [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g S U l        g )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r*   r+   r,   r-   paddingr0   r.   r/   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r!   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r5   	__class__s               r#   r=   MobileViTConvLayer.__init__;   s,    	{Q!+,x71$/}<STZS[[cdee A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#.."("8F--s33"():):";"("3"3"DOr%   featuresc                     U R                  U5      nU R                  b  U R                  U5      nU R                  b  U R                  U5      nU$ N)r@   rB   rE   )rG   rJ   s     r#   forwardMobileViTConvLayer.forwardq   sK    ##H-)))(3H??&x0Hr%   )rE   r@   rB   )r   r   Fr   TT)__name__
__module____qualname____firstlineno__r   r!   boolr   rD   r=   torchTensorrM   __static_attributes____classcell__rH   s   @r#   r'   r'   :   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4# 4#l   r%   r'   c                      ^  \ rS rSrSr SS\S\S\S\S\SS	4U 4S
 jjjrS\R                  S\R                  4S jr
SrU =r$ )MobileViTInvertedResidualz   zI
Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
r)   r*   r+   r-   r0   r   Nc           
      6  > [         TU ]  5         [        [        [	        X!R
                  -  5      5      S5      nUS;  a  [        SU S35      eUS:H  =(       a    X#:H  U l        [        XUSS9U l	        [        UUUSUUUS9U l
        [        UUUSS	S
9U l        g )N   )r   r   zInvalid stride .r   r*   r+   r,   r   )r*   r+   r,   r-   r.   r0   Fr*   r+   r,   r2   )r<   r=   r$   r!   roundexpand_ratior>   use_residualr'   
expand_1x1conv_3x3
reduce_1x1)rG   r)   r*   r+   r-   r0   expanded_channelsrH   s          r#   r=   "MobileViTInvertedResidual.__init__   s     	*3u[CVCV5V/W+XZ[\vha899#q[K{/J,:KYZ
 +)*$
 -)% 
r%   rJ   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  (       a  X!-   $ U$ rL   )rd   re   rf   rc   )rG   rJ   residuals      r#   rM   !MobileViTInvertedResidual.forward   sG    ??8,==*??8,&*&7&7x"EXEr%   )re   rd   rf   rc   r   )rO   rP   rQ   rR   __doc__r   r!   r=   rT   rU   rM   rV   rW   rX   s   @r#   rZ   rZ   z   sn    
 jk
%
47
GJ
TW
cf
	
 
BF F F Fr%   rZ   c                      ^  \ rS rSr SS\S\S\S\S\SS4U 4S	 jjjrS
\R                  S\R                  4S jr	Sr
U =r$ )MobileViTMobileNetLayer   r)   r*   r+   r-   
num_stagesr   Nc                    > [         TU ]  5         [        R                  " 5       U l        [        U5       H4  n[        UUUUS:X  a  UOSS9nU R                  R                  U5        UnM6     g )Nr   r   )r*   r+   r-   )r<   r=   r   
ModuleListlayerrangerZ   append)	rG   r)   r*   r+   r-   rq   irt   rH   s	           r#   r=    MobileViTMobileNetLayer.__init__   sc     	]]_
z"A-')!"avQ	E JJe$&K #r%   rJ   c                 <    U R                    H  nU" U5      nM     U$ rL   rt   )rG   rJ   layer_modules      r#   rM   MobileViTMobileNetLayer.forward   s     JJL#H-H 'r%   rz   )r   r   rO   rP   rQ   rR   r   r!   r=   rT   rU   rM   rV   rW   rX   s   @r#   ro   ro      s`    op'%'47'GJ'TW'il'	' '    r%   ro   c                      ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	\R                  S\R                  4S
 jr
SrU =r$ )MobileViTSelfAttention   r)   hidden_sizer   Nc                 r  > [         TU ]  5         X!R                  -  S:w  a  [        SU SUR                   S35      eUR                  U l        [	        X!R                  -  5      U l        U R                  U R
                  -  U l        [        R                  " X R                  UR                  S9U l
        [        R                  " X R                  UR                  S9U l        [        R                  " X R                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   zThe hidden size z4 is not a multiple of the number of attention heads r^   )r/   )r<   r=   num_attention_headsr>   r!   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrG   r)   r   rH   s      r#   r=   MobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{5O5O'O#P !558P8PPYY{,>,>V__U
99[*<*<6??SYY{,>,>V__U
zz&"E"EFr%   xc                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )Nr   r   r   r   )sizer   r   viewpermute)rG   r   new_x_shapes      r#   transpose_for_scores+MobileViTSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$r%   hidden_statesc                    U R                  U5      nU R                  U R                  U5      5      nU R                  U R                  U5      5      nU R                  U5      n[        R
                  " XSR                  SS5      5      nU[        R                  " U R                  5      -  n[        R                  R                  USS9nU R                  U5      n[        R
                  " Xt5      nUR                  SSSS5      R                  5       nUR!                  5       S S U R"                  4-   n	UR$                  " U	6 nU$ )Nr   dimr   r   r   r   )r   r   r   r   rT   matmul	transposemathsqrtr   r   
functionalsoftmaxr   r   
contiguousr   r   r   )
rG   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapes
             r#   rM   MobileViTSelfAttention.forward   s     JJ}5--dhh}.EF	//

=0IJ//0AB !<<5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_B%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDr%   )r   r   r   r   r   r   r   )rO   rP   rQ   rR   r   r!   r=   rT   rU   r   rM   rV   rW   rX   s   @r#   r   r      s\    G GS GT G&%ell %u|| %
U\\ ell  r%   r   c                   r   ^  \ rS rSrS\S\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTSelfOutput   r)   r   r   Nc                    > [         TU ]  5         [        R                  " X"5      U l        [        R
                  " UR                  5      U l        g rL   r<   r=   r   r   denser   hidden_dropout_probr   r   s      r#   r=   MobileViTSelfOutput.__init__   s4    YY{8
zz&"<"<=r%   r   c                 J    U R                  U5      nU R                  U5      nU$ rL   r   r   rG   r   s     r#   rM   MobileViTSelfOutput.forward   s$    

=1]3r%   r   r}   rX   s   @r#   r   r      s=    > >S >T >
U\\ ell  r%   r   c                      ^  \ rS rSrS\S\SS4U 4S jjrS\\   SS4S jrS	\	R                  S\	R                  4S
 jrSrU =r$ )MobileViTAttention   r)   r   r   Nc                    > [         TU ]  5         [        X5      U l        [	        X5      U l        [        5       U l        g rL   )r<   r=   r   	attentionr   outputsetpruned_headsr   s      r#   r=   MobileViTAttention.__init__  s0    /D)&>Er%   headsc                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )lenr   r   r   r   r   r   r   r   r   r   r   r   union)rG   r   indexs      r#   prune_headsMobileViTAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r%   r   c                 J    U R                  U5      nU R                  U5      nU$ rL   )r   r   )rG   r   self_outputsattention_outputs       r#   rM   MobileViTAttention.forward  s%    ~~m4;;|4r%   )r   r   r   )rO   rP   rQ   rR   r   r!   r=   r   r   rT   rU   rM   rV   rW   rX   s   @r#   r   r      sT    " "S "T ";S ;d ;$ U\\  ell    r%   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTIntermediatei  r)   r   intermediate_sizer   Nc                    > [         TU ]  5         [        R                  " X#5      U l        [        UR                  [        5      (       a  [        UR                     U l	        g UR                  U l	        g rL   )
r<   r=   r   r   r   rC   rF   rD   r   intermediate_act_fnrG   r)   r   r   rH   s       r#   r=   MobileViTIntermediate.__init__   sR    YY{>
f''--'-f.?.?'@D$'-'8'8D$r%   r   c                 J    U R                  U5      nU R                  U5      nU$ rL   r   r   r   s     r#   rM   MobileViTIntermediate.forward(  s&    

=100?r%   r   r}   rX   s   @r#   r   r     sF    9 9S 9UX 9]a 9U\\ ell  r%   r   c                      ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S	\R                  S\R                  4S
 jr	Sr
U =r$ )MobileViTOutputi.  r)   r   r   r   Nc                    > [         TU ]  5         [        R                  " X25      U l        [        R
                  " UR                  5      U l        g rL   r   r   s       r#   r=   MobileViTOutput.__init__/  s5    YY0>
zz&"<"<=r%   r   input_tensorc                 R    U R                  U5      nU R                  U5      nX-   nU$ rL   r   )rG   r   r   s      r#   rM   MobileViTOutput.forward4  s,    

=1]3%4r%   r   r}   rX   s   @r#   r   r   .  sT    > >S >UX >]a >
U\\  RWR^R^  r%   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformerLayeri;  r)   r   r   r   Nc                   > [         TU ]  5         [        X5      U l        [	        XU5      U l        [        XU5      U l        [        R                  " X!R                  S9U l        [        R                  " X!R                  S9U l        g )Nr8   )r<   r=   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r#   r=   "MobileViTTransformerLayer.__init__<  sg    +F@1&GXY%f;LM "[>S>S T!||K=R=RSr%   r   c                     U R                  U R                  U5      5      nX!-   nU R                  U5      nU R                  U5      nU R	                  X15      nU$ rL   )r   r   r   r   r   )rG   r   r   layer_outputs       r#   rM   !MobileViTTransformerLayer.forwardD  sX    >>$*?*?*NO(8++M:((6{{<?r%   )r   r   r   r   r   r}   rX   s   @r#   r   r   ;  sK    T TS TUX T]a TU\\ ell  r%   r   c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTTransformeriN  r)   r   rq   r   Nc           	         > [         TU ]  5         [        R                  " 5       U l        [        U5       H>  n[        UU[        X!R                  -  5      S9nU R                  R                  U5        M@     g )N)r   r   )
r<   r=   r   rs   rt   ru   r   r!   	mlp_ratiorv   )rG   r)   r   rq   _transformer_layerrH   s         r#   r=   MobileViTTransformer.__init__O  sa    ]]_
z"A 9'"%k4D4D&D"E!
 JJ/0 #r%   r   c                 <    U R                    H  nU" U5      nM     U$ rL   rz   )rG   r   r{   s      r#   rM   MobileViTTransformer.forward[  s      JJL(7M 'r%   rz   r}   rX   s   @r#   r   r   N  sE    
1 
1S 
1c 
1VZ 
1U\\ ell  r%   r   c                     ^  \ rS rSrSr SS\S\S\S\S\S\S	\S
S4U 4S jjjrS\R                  S
\
\R                  \4   4S jrS\R                  S\S
\R                  4S jrS\R                  S
\R                  4S jrSrU =r$ )MobileViTLayeria  z3
MobileViT block: https://arxiv.org/abs/2110.02178
r)   r*   r+   r-   r   rq   r0   r   Nc           	        > [         TU ]  5         UR                  U l        UR                  U l        US:X  a(  [        UUUUS:X  a  UOSUS:  a  US-  OSS9U l        UnOS U l        [        UUUUR                  S9U l	        [        UUUSSSS9U l
        [        UUUS9U l        [        R                  " XQR                  S9U l        [        XUSS9U l        [        USU-  X!R                  S9U l        g )	Nr   r   )r*   r+   r-   r0   r_   F)r*   r+   r,   r1   r2   )r   rq   r   )r<   r=   
patch_sizepatch_widthpatch_heightrZ   downsampling_layerr'   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rG   r)   r*   r+   r-   r   rq   r0   rH   s	           r#   r=   MobileViTLayer.__init__f  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 k7L7LM1+ST 
 )KkWnWn
r%   rJ   c                 n   U R                   U R                  p2[        X#-  5      nUR                  u  pVpx[        R
                  R                  5       (       a$  [        [        R                  " Xs-  5      U-  5      O#[        [        R                  " Xs-  5      U-  5      n	[        R
                  R                  5       (       a$  [        [        R                  " X-  5      U-  5      O#[        [        R                  " X-  5      U-  5      n
SnX:w  d  X:w  a#  [        R                  R                  XU
4SSS9nSnX-  nX-  nX-  nUR                  XV-  U-  X<U5      nUR                  SS5      nUR                  XVX5      nUR                  SS5      nUR                  XT-  US5      nXx4UUUUUUS	.nUU4$ )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r!   shaperT   jit
is_tracingr   ceilr   r   r   r  reshaper   )rG   rJ   r   r   
patch_arear	  r
  orig_height
orig_width
new_height	new_widthr  num_patch_widthnum_patch_heightr  patches	info_dicts                    r#   	unfoldingMobileViTLayer.unfolding  s   $($4$4d6G6G\34
8@5
k yy##%% ejj!;<|KLTYY{9:\IJ 	 yy##%% ejj!9:[HITYYz78;FG 	 "j&?}}00I6ZW\ 1 H K $2%5&8 ""!$44lU`
 ##Aq)//*P##Aq)//*"9;K &2$ &&!0"2
	 	!!r%   r  r  c                    U R                   U R                  pC[        X4-  5      nUS   nUS   nUS   nUS   n	US   n
UR                  5       R	                  XeUS5      nUR                  SS5      nUR                  Xg-  U	-  XU5      nUR                  SS	5      nUR                  XgX-  X-  5      nUS
   (       a"  [        R                  R                  XS   SSS9nU$ )Nr	  r
  r  r  r  r   r   r   r   r  r  r  Fr  )
r   r   r!   r   r   r   r  r   r   r  )rG   r  r  r   r   r  r	  r
  r  r  r  rJ   s               r#   foldingMobileViTLayer.folding  s   $($4$4d6G6G\34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44oU`
 %%a+##"2"A?C`
 ]#}}005JV[ 1 H r%   c                    U R                   (       a  U R                  U5      nUnU R                  U5      nU R                  U5      nU R                  U5      u  p4U R	                  U5      nU R                  U5      nU R                  X45      nU R                  U5      nU R                  [        R                  " X!4SS95      nU$ Nr   r   )r   r   r   r  r   r   r   r   r  rT   cat)rG   rJ   rj   r  r  s        r#   rM   MobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy()=1EFr%   )	r   r   r   r   r  r   r   r   r   rl   )rO   rP   rQ   rR   rm   r   r!   r=   rT   rU   r   r   r  r   rM   rV   rW   rX   s   @r#   r   r   a  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
 8
t1"%,, 1"5t9K3L 1"fu||   :   r%   r   c                   t   ^  \ rS rSrS\SS4U 4S jjr  SS\R                  S\S\S\	\
\4   4S	 jjrS
rU =r$ )MobileViTEncoderi
  r)   r   Nc           
        > [         T
U ]  5         Xl        [        R                  " 5       U l        SU l        S=p#UR                  S:X  a  SnSnOUR                  S:X  a  SnSn[        UUR                  S   UR                  S   SSS9nU R
                  R                  U5        [        UUR                  S   UR                  S   SS	S9nU R
                  R                  U5        [        UUR                  S   UR                  S	   SUR                  S   SS
9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S	   UR                  S   SUR                  S   SUS9nU R
                  R                  U5        U(       a  US-  n[        UUR                  S   UR                  S   SUR                  S   S	US9n	U R
                  R                  U	5        g )NFr]   T   r   r   )r*   r+   r-   rq   r   r   )r*   r+   r-   r   rq      )r*   r+   r-   r   rq   r0      )r<   r=   r)   r   rs   rt   gradient_checkpointingoutput_stridero   neck_hidden_sizesrv   r   hidden_sizes)rG   r)   dilate_layer_4dilate_layer_5r0   layer_1layer_2layer_3layer_4layer_5rH   s             r#   r=   MobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r%   r   output_hidden_statesreturn_dictc                 D   U(       a  SOS n[        U R                  5       HZ  u  pVU R                  (       a.  U R                  (       a  U R	                  UR
                  U5      nOU" U5      nU(       d  MU  XA4-   nM\     U(       d  [        S X4 5       5      $ [        XS9$ )N c              3   .   #    U  H  oc  M  Uv   M     g 7frL   r;  ).0vs     r#   	<genexpr>+MobileViTEncoder.forward.<locals>.<genexpr>j  s     X$Fq$Fs   	)last_hidden_stater   )	enumeratert   r,  training_gradient_checkpointing_func__call__tupler   )rG   r   r8  r9  all_hidden_statesrw   r{   s          r#   rM   MobileViTEncoder.forwardU  s     #7BD(4OA**t}} $ A A ))!!
 !-] ;##$58H$H!  5 X]$FXXX-oor%   )r)   r,  rt   )FT)rO   rP   rQ   rR   r   r=   rT   rU   rS   r   rF  r   rM   rV   rW   rX   s   @r#   r'  r'  
  sg    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5p pr%   r'  c                       \ rS rSr\rSrSrSrS/r	S\
\R                  \R                  \R                  4   SS4S	 jrS
rg)MobileViTPreTrainedModelio  	mobilevitpixel_valuesTr   moduler   Nc                 
   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R
                  R                  5         UR                  R
                  R                  S5        gg)zInitialize the weightsg        )meanstdNg      ?)rC   r   r   r?   weightdatanormal_r)   initializer_ranger/   zero_r   fill_)rG   rM  s     r#   _init_weights&MobileViTPreTrainedModel._init_weightsw  s    fryy"))455 MM&&CT[[5R5R&S{{&  &&( '--KK""$MM$$S) .r%   r;  )rO   rP   rQ   rR   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   r   r?   r   rW  rV   r;  r%   r#   rJ  rJ  o  sM    "L#$O&*#)*
*E"))RYY*L$M 
*RV 
*r%   rJ  c                      ^  \ rS rSrSS\S\4U 4S jjjrS r\   SS\	\
R                     S\	\   S\	\   S	\\\4   4S
 jj5       rSrU =r$ )MobileViTModeli  r)   expand_outputc                 F  > [         TU ]  U5        Xl        X l        [	        UUR
                  UR                  S   SSS9U l        [        U5      U l	        U R                  (       a+  [	        UUR                  S   UR                  S   SS9U l
        U R                  5         g	)
a%  
expand_output (`bool`, *optional*, defaults to `True`):
    Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
    1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
r   r   r   )r*   r+   r,   r-   r+     r   r_   N)r<   r=   r)   r`  r'   num_channelsr.  	conv_stemr'  encoderconv_1x1_exp	post_init)rG   r)   r`  rH   s      r#   r=   MobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r%   c                    UR                  5        Hm  u  p#U R                  R                  U   n[        U[        5      (       d  M5  UR
                  R                   H  nUR                  R                  U5        M      Mo     g)zPrunes heads of the model.
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
N)itemsre  rt   rC   r   r   r   r   )rG   heads_to_prunelayer_indexr   mobilevit_layerr   s         r#   _prune_headsMobileViTModel._prune_heads  sg     #1"6"6"8K"ll00=O/>::)8)D)D)J)J%%//;;EB *K #9r%   rL  r8  r9  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUUS9nU R                  (       a-  U R                  US   5      n[        R                  " USS/SS9nOUS   nS nU(       d  Ub  Xg4OU4nXSS  -   $ [        UUUR                  S	9$ )
Nz You have to specify pixel_valuesr8  r9  r   r   r   F)r   keepdimr   )rA  pooler_outputr   )r)   r8  use_return_dictr>   rd  re  r`  rf  rT   rO  r   r   )	rG   rL  r8  r9  embedding_outputencoder_outputsrA  pooled_outputr   s	            r#   rM   MobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFAB///7/')77
 	
r%   )r)   rf  rd  re  r`  )T)NNN)rO   rP   rQ   rR   r   rS   r=   rn  r   r   rT   rU   r   rF  r   rM   rV   rW   rX   s   @r#   r_  r_    s     t  >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r%   r_  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\
   S\\R                     S	\\
   S\\\4   4
S
 jj5       rSrU =r$ )MobileViTForImageClassificationi  r)   r   Nc                 ~  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  SS9U l        UR                  S:  a.  [
        R                  " UR                  S   UR                  5      O[
        R                  " 5       U l        U R                  5         g )NT)inplacer   r   )r<   r=   
num_labelsr_  rK  r   r   classifier_dropout_probr   r   r.  Identity
classifierrg  rG   r)   rH   s     r#   r=   (MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r%   rL  r8  labelsr9  c                 P   Ub  UOU R                   R                  nU R                  XUS9nU(       a  UR                  OUS   nU R	                  U R                  U5      5      nSnUGb  U R                   R                  c  U R                  S:X  a  SU R                   l        OoU R                  S:  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l        OSU R                   l        U R                   R                  S:X  aI  [        5       n	U R                  S:X  a&  U	" UR                  5       UR                  5       5      nOU	" Xs5      nOU R                   R                  S:X  a=  [        5       n	U	" UR                  SU R                  5      UR                  S5      5      nO,U R                   R                  S:X  a  [!        5       n	U	" Xs5      nU(       d  U4USS -   n
Ub  U4U
-   $ U
$ [#        UUUR$                  S	9$ )
ab  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
Nrq  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r)   rt  rK  rs  r  r   problem_typer~  dtyperT   longr!   r   squeezer
   r   r	   r   r   )rG   rL  r8  r  r9  outputsrw  r  r  loss_fctr   s              r#   rM   'MobileViTForImageClassification.forward  s    &1%<k$++B]B]..fq.r1<--'!*m!<={{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#F3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r%   )r  r   rK  r~  NNNN)rO   rP   rQ   rR   r   r=   r   r   rT   rU   rS   r   rF  r   rM   rV   rW   rX   s   @r#   r{  r{    s     4   04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r%   r{  c                   v   ^  \ rS rSrS\S\S\SS4U 4S jjrS\R                  S\R                  4S	 jr	S
r
U =r$ )MobileViTASPPPoolingi(  r)   r*   r+   r   Nc           
      |   > [         TU ]  5         [        R                  " SS9U l        [        UUUSSSSS9U l        g )Nr   )output_sizeTrelu)r*   r+   r,   r-   r1   r2   )r<   r=   r   AdaptiveAvgPool2dglobal_poolr'   r   )rG   r)   r*   r+   rH   s       r#   r=   MobileViTASPPPooling.__init__)  sB    //A>*#%"!
r%   rJ   c                     UR                   SS  nU R                  U5      nU R                  U5      n[        R                  R                  XSSS9nU$ )Nr   r  Fr  )r  r  r   r   r   r  )rG   rJ   spatial_sizes      r#   rM   MobileViTASPPPooling.forward8  sQ    ~~bc*##H-==*==,,Xzin,or%   )r   r  r}   rX   s   @r#   r  r  (  sF    
 
S 
PS 
X\ 
   r%   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTASPPi@  zk
ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
r)   r   Nc                 p  > [         TU ]  5         UR                  S   nUR                  n[	        UR
                  5      S:w  a  [        S5      e[        R                  " 5       U l	        [        UUUSSS9nU R                  R                  U5        U R                  R                  UR
                   Vs/ s H  n[        UUUSUSS9PM     sn5        [        XU5      nU R                  R                  U5        [        USU-  USSS9U l        [        R                  " UR                   S	9U l        g s  snf )
Nr   r   z"Expected 3 values for atrous_ratesr   r  r`   )r*   r+   r,   r0   r2   r+  )p)r<   r=   r.  aspp_out_channelsr   atrous_ratesr>   r   rs   convsr'   rv   extendr  projectr   aspp_dropout_probr   )rG   r)   r*   r+   in_projectionrate
pool_layerrH   s          r#   r=   MobileViTASPP.__init__E  s-   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
 0D # +!- !!#) 0
	
 *&|L


*%)L 0|YZkq
 zzF$<$<=)
s   4D3rJ   c                     / nU R                    H  nUR                  U" U5      5        M     [        R                  " USS9nU R	                  U5      nU R                  U5      nU$ r#  )r  rv   rT   r$  r  r   )rG   rJ   pyramidconvpooled_featuress        r#   rM   MobileViTASPP.forwardp  sW    JJDNN4>* ))G+,,w/,,7r%   )r  r   r  rO   rP   rQ   rR   rm   r   r=   rT   rU   rM   rV   rW   rX   s   @r#   r  r  @  s<    )> )>4 )>V   r%   r  c                   r   ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\R                  4S jr	S	r
U =r$ )
MobileViTDeepLabV3i{  z:
DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
r)   r   Nc           
         > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        UUR                  UR                  SSSSS9U l        g )Nr   FT)r*   r+   r,   r1   r2   r/   )r<   r=   r  asppr   	Dropout2dr  r   r'   r  r~  r  r  s     r#   r=   MobileViTDeepLabV3.__init__  s]    !&)	||F$B$BC,00**# 
r%   r   c                 r    U R                  US   5      nU R                  U5      nU R                  U5      nU$ )Nr   )r  r   r  )rG   r   rJ   s      r#   rM   MobileViTDeepLabV3.forward  s6    99]2./<<)??8,r%   )r  r  r   r  rX   s   @r#   r  r  {  s;    
 
4 
 U\\ ell  r%   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                      ^  \ rS rSrS\SS4U 4S jjr\    SS\\R                     S\\R                     S\\
   S	\\
   S\\\4   4
S
 jj5       rSrU =r$ ) MobileViTForSemanticSegmentationi  r)   r   Nc                    > [         TU ]  U5        UR                  U l        [        USS9U l        [        U5      U l        U R                  5         g )NF)r`  )r<   r=   r~  r_  rK  r  segmentation_headrg  r  s     r#   r=   )MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r%   rL  r  r8  r9  c                 z   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb%  U R                   R                  S:X  a  [	        S5      eU R                  USUS9nU(       a  UR                  OUS   nU R                  U5      nSnUbQ  [        R                  R                  XrR                  SS SSS	9n	[        U R                   R                  S
9n
U
" X5      nU(       d%  U(       a
  U4USS -   nO	U4USS -   nUb  U4U-   $ U$ [        UUU(       a  UR                  SS9$ SSS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
    Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

Examples:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
>>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> # logits are of shape (batch_size, num_labels, height, width)
>>> logits = outputs.logits
```Nr   z/The number of labels should be greater than oneTrq  r   r  Fr  )ignore_indexr   )r  r  r   
attentions)r)   r8  rt  r~  r>   rK  r   r  r   r   r  r  r
   semantic_loss_ignore_indexr   )rG   rL  r  r8  r9  r  encoder_hidden_statesr  r  upsampled_logitsr  r   s               r#   rM   (MobileViTForSemanticSegmentation.forward  sm   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88\\"#.Zu  9   (T[[5[5[\H,5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r%   )rK  r~  r  r  )rO   rP   rQ   rR   r   r=   r   r   rT   rU   rS   r   rF  r   rM   rV   rW   rX   s   @r#   r  r    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r%   r  )r{  r  r_  rJ  )r]   N);rm   r   typingr   r   r   r   r   rT   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrO   loggerr!   r$   Moduler'   rZ   ro   r   r   r   r   r   r   r   r   r'  rJ  r_  r{  r  r  r  r  __all__r;  r%   r#   <module>r     s  "   4 4    A A !  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .0RYY 0f	")) 	   >BII 
bii 
		 &299 &fRYY fRbpryy bpJ * * *( R
- R
 R
j E
&> E
E
P299 08BII 8v 8 
U
'? U

U
pr%   