o
    Zh                  	   @   sX  d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ ddlm Z  e!e"Z#d?de$de$dee$ de$fddZ%G dd de
j&Z'G dd de
j&Z(G dd de
j&Z)G dd de
j&Z*G dd de
j&Z+G dd  d e
j&Z,G d!d" d"e
j&Z-G d#d$ d$e
j&Z.G d%d& d&e
j&Z/G d'd( d(e
j&Z0G d)d* d*e
j&Z1G d+d, d,e
j&Z2eG d-d. d.eZ3eG d/d0 d0e3Z4ed1d2G d3d4 d4e3Z5G d5d6 d6e
j&Z6G d7d8 d8e
j&Z7G d9d: d:e
j&Z8ed;d2G d<d= d=e3Z9g d>Z:dS )@zPyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfig   valuedivisor	min_valuereturnc                 C   sF   |du r|}t |t| |d  | | }|d|  k r||7 }t|S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    N   g?)maxint)r   r   r   	new_value r"   _/var/www/auris/lib/python3.10/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisible+   s   r$   c                       sv   e Zd Z						ddededededed	ed
edededeeef ddf fddZde	j
de	j
fddZ  ZS )MobileViTConvLayerr   FTconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                    s   t    t|d d | }|| dkr td| d| d|| dkr1td| d| dtj||||||||dd		| _|	rNtj|d
dddd| _nd | _|
rst	|
t
r_t|
 | _d S t	|jt
rmt|j | _d S |j| _d S d | _d S )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (Zzeros)	r'   r(   r)   r*   paddingr-   r+   r,   Zpadding_modegh㈵>g?T)Znum_featuresepsZmomentumZaffineZtrack_running_stats)super__init__r    
ValueErrorr   Conv2dconvolutionZBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   	__class__r"   r#   r3   ;   sB   



zMobileViTConvLayer.__init__featuresc                 C   s6   |  |}| jd ur| |}| jd ur| |}|S N)r6   r7   r:   )r<   r?   r"   r"   r#   forwardq   s   




zMobileViTConvLayer.forward)r   r   Fr   TT)__name__
__module____qualname__r   r    boolr   r9   r3   torchTensorrA   __classcell__r"   r"   r=   r#   r%   :   s>    	

6r%   c                       sT   e Zd ZdZ	ddedededededd	f fd
dZdejdejfddZ	  Z
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r   r&   r'   r(   r*   r-   r   Nc              	      s   t    ttt||j d}|dvrtd| d|dko$||k| _t|||dd| _	t|||d|||d| _
t|||dd	d
| _d S )Nr   )r   r   zInvalid stride .r   r'   r(   r)   r   )r'   r(   r)   r*   r+   r-   Fr'   r(   r)   r/   )r2   r3   r$   r    roundZexpand_ratior4   use_residualr%   
expand_1x1conv_3x3
reduce_1x1)r<   r&   r'   r(   r*   r-   Zexpanded_channelsr=   r"   r#   r3      s0   

z"MobileViTInvertedResidual.__init__r?   c                 C   s4   |}|  |}| |}| |}| jr|| S |S r@   )rO   rP   rQ   rN   )r<   r?   residualr"   r"   r#   rA      s
   


z!MobileViTInvertedResidual.forwardr   )rB   rC   rD   __doc__r   r    r3   rF   rG   rA   rH   r"   r"   r=   r#   rI   z   s"    !rI   c                       sP   e Zd Z	ddedededededdf fd	d
ZdejdejfddZ  Z	S )MobileViTMobileNetLayerr   r&   r'   r(   r*   
num_stagesr   Nc                    sR   t    t | _t|D ]}t||||dkr|ndd}| j| |}qd S )Nr   r   )r'   r(   r*   )r2   r3   r   
ModuleListlayerrangerI   append)r<   r&   r'   r(   r*   rV   irX   r=   r"   r#   r3      s   

z MobileViTMobileNetLayer.__init__r?   c                 C      | j D ]}||}q|S r@   rX   )r<   r?   layer_moduler"   r"   r#   rA         

zMobileViTMobileNetLayer.forward)r   r   
rB   rC   rD   r   r    r3   rF   rG   rA   rH   r"   r"   r=   r#   rU      s     rU   c                       sV   e Zd Zdededdf fddZdejdejfdd	Zd
ejdejfddZ	  Z
S )MobileViTSelfAttentionr&   hidden_sizer   Nc                    s   t    ||j dkrtd| d|j d|j| _t||j | _| j| j | _tj|| j|j	d| _
tj|| j|j	d| _tj|| j|j	d| _t|j| _d S )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rJ   )r,   )r2   r3   num_attention_headsr4   r    attention_head_sizeall_head_sizer   LinearZqkv_biasquerykeyr   DropoutZattention_probs_dropout_probdropoutr<   r&   rb   r=   r"   r#   r3      s   
zMobileViTSelfAttention.__init__xc                 C   s6   |  d d | j| jf }|j| }|ddddS )Nr   r   r   r   )sizerc   rd   viewpermute)r<   rl   Znew_x_shaper"   r"   r#   transpose_for_scores   s   
z+MobileViTSelfAttention.transpose_for_scoreshidden_statesc           
      C   s   |  |}| | |}| | |}| |}t||dd}|t| j	 }t
jj|dd}| |}t||}|dddd }| d d | jf }	|j|	 }|S )Nrm   dimr   r   r   r   )rg   rq   rh   r   rF   matmul	transposemathsqrtrd   r   
functionalZsoftmaxrj   rp   
contiguousrn   re   ro   )
r<   rr   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresZattention_probsZcontext_layerZnew_context_layer_shaper"   r"   r#   rA      s   



zMobileViTSelfAttention.forward)rB   rC   rD   r   r    r3   rF   rG   rq   rA   rH   r"   r"   r=   r#   ra      s    ra   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  Z	S )
MobileViTSelfOutputr&   rb   r   Nc                    s*   t    t||| _t|j| _d S r@   r2   r3   r   rf   denseri   Zhidden_dropout_probrj   rk   r=   r"   r#   r3         
zMobileViTSelfOutput.__init__rr   c                 C      |  |}| |}|S r@   r~   rj   r<   rr   r"   r"   r#   rA      r_   zMobileViTSelfOutput.forwardr`   r"   r"   r=   r#   r|      s    r|   c                       sV   e Zd Zdededdf fddZdee ddfdd	Zd
ej	dej	fddZ
  ZS )MobileViTAttentionr&   rb   r   Nc                    s.   t    t||| _t||| _t | _d S r@   )r2   r3   ra   	attentionr|   outputsetpruned_headsrk   r=   r"   r#   r3     s   
zMobileViTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rt   )lenr   r   rc   rd   r   r   rg   rh   r   r   r~   re   union)r<   r   indexr"   r"   r#   prune_heads  s   zMobileViTAttention.prune_headsrr   c                 C   s   |  |}| |}|S r@   )r   r   )r<   rr   Zself_outputsattention_outputr"   r"   r#   rA     r_   zMobileViTAttention.forward)rB   rC   rD   r   r    r3   r   r   rF   rG   rA   rH   r"   r"   r=   r#   r      s    r   c                       D   e Zd Zdedededdf fddZdejdejfd	d
Z  Z	S )MobileViTIntermediater&   rb   intermediate_sizer   Nc                    s@   t    t||| _t|jtrt|j | _	d S |j| _	d S r@   )
r2   r3   r   rf   r~   r8   r;   r9   r   intermediate_act_fnr<   r&   rb   r   r=   r"   r#   r3      s
   
zMobileViTIntermediate.__init__rr   c                 C   r   r@   )r~   r   r   r"   r"   r#   rA   (  r_   zMobileViTIntermediate.forwardr`   r"   r"   r=   r#   r         r   c                       sJ   e Zd Zdedededdf fddZdejd	ejdejfd
dZ  Z	S )MobileViTOutputr&   rb   r   r   Nc                    s*   t    t||| _t|j| _d S r@   r}   r   r=   r"   r#   r3   /  r   zMobileViTOutput.__init__rr   input_tensorc                 C   s    |  |}| |}|| }|S r@   r   )r<   rr   r   r"   r"   r#   rA   4  s   

zMobileViTOutput.forwardr`   r"   r"   r=   r#   r   .  s    $r   c                       r   )MobileViTTransformerLayerr&   rb   r   r   Nc                    sZ   t    t||| _t|||| _t|||| _tj	||j
d| _tj	||j
d| _d S )Nr1   )r2   r3   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   r=   r"   r#   r3   <  s   
z"MobileViTTransformerLayer.__init__rr   c                 C   s<   |  | |}|| }| |}| |}| ||}|S r@   )r   r   r   r   r   )r<   rr   r   Zlayer_outputr"   r"   r#   rA   D  s   

z!MobileViTTransformerLayer.forwardr`   r"   r"   r=   r#   r   ;  r   r   c                       r   )MobileViTTransformerr&   rb   rV   r   Nc                    sJ   t    t | _t|D ]}t||t||j d}| j	| qd S )N)rb   r   )
r2   r3   r   rW   rX   rY   r   r    Z	mlp_ratiorZ   )r<   r&   rb   rV   _transformer_layerr=   r"   r#   r3   O  s   

zMobileViTTransformer.__init__rr   c                 C   r\   r@   r]   )r<   rr   r^   r"   r"   r#   rA   [  r_   zMobileViTTransformer.forwardr`   r"   r"   r=   r#   r   N  s    r   c                       s   e Zd ZdZ	ddedededededed	ed
df fddZdejd
e	eje
f fddZdejde
d
ejfddZdejd
ejfddZ  ZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r   r&   r'   r(   r*   rb   rV   r-   r   Nc                    s   t    |j| _|j| _|dkr,t||||dkr|nd|dkr$|d ndd| _|}nd | _t||||jd| _	t|||dddd| _
t|||d| _tj||jd| _t|||dd| _t|d| ||jd| _d S )	Nr   r   )r'   r(   r*   r-   rK   F)r'   r(   r)   r.   r/   )rb   rV   r   )r2   r3   Z
patch_sizepatch_widthpatch_heightrI   downsampling_layerr%   Zconv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)r<   r&   r'   r(   r*   rb   rV   r-   r=   r"   r#   r3   f  sN   

	zMobileViTLayer.__init__r?   c                 C   sN  | j | j}}t|| }|j\}}}}tj r$tt|| | n
tt	|| | }	tj r?tt|| | n
tt	|| | }
d}|
|ksT|	|krbt
jj||	|
fddd}d}|
| }|	| }|| }||| | |||}|dd}|||||}|dd}||| |d}||f||||||d	}||fS )
NFbilinearrn   modeZalign_cornersTr   r   r   rm   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r    shaperF   Zjit
is_tracingr   ceilrx   r   rz   r   reshaperw   )r<   r?   r   r   
patch_arear   r   Zorig_heightZ
orig_widthZ
new_heightZ	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dictr"   r"   r#   	unfolding  sH   	zMobileViTLayer.unfoldingr   r   c                 C   s   | j | j}}t|| }|d }|d }|d }|d }	|d }
| |||d}|dd}||| |	 |
||}|dd	}||||	| |
| }|d
 r_tjj	||d ddd}|S )Nr   r   r   r   r   rm   r   r   r   r   r   r   Fr   )
r   r   r    r{   ro   rw   r   r   rz   r   )r<   r   r   r   r   r   r   r   r   r   r   r?   r"   r"   r#   folding  s*   zMobileViTLayer.foldingc                 C   s|   | j r|  |}|}| |}| |}| |\}}| |}| |}| ||}| |}| t	j
||fdd}|S Nr   rt   )r   r   r   r   r   r   r   r   r   rF   cat)r<   r?   rR   r   r   r"   r"   r#   rA     s   





zMobileViTLayer.forwardrS   )rB   rC   rD   rT   r   r    r3   rF   rG   r   r   r   r   rA   rH   r"   r"   r=   r#   r   a  s.    	:3r   c                       sP   e Zd Zdeddf fddZ		ddejd	ed
edee	e
f fddZ  ZS )MobileViTEncoderr&   r   Nc           
   	      sX  t    || _t | _d| _d }}|jdkrd}d}n|jdkr%d}d}t||j	d |j	d ddd}| j
| t||j	d |j	d dd	d}| j
| t||j	d |j	d	 d|jd dd
}| j
| |rp|d9 }t||j	d	 |j	d d|jd d|d}| j
| |r|d9 }t||j	d |j	d d|jd d	|d}	| j
|	 d S )NFr   T   r   r   )r'   r(   r*   rV   r   r   )r'   r(   r*   rb   rV      )r'   r(   r*   rb   rV   r-      )r2   r3   r&   r   rW   rX   gradient_checkpointingZoutput_striderU   neck_hidden_sizesrZ   r   Zhidden_sizes)
r<   r&   Zdilate_layer_4Zdilate_layer_5r-   Zlayer_1Zlayer_2Zlayer_3Zlayer_4Zlayer_5r=   r"   r#   r3     sx   



		zMobileViTEncoder.__init__FTrr   output_hidden_statesreturn_dictc                 C   sx   |rdnd }t | jD ]\}}| jr| jr| |j|}n||}|r(||f }q|s6tdd ||fD S t||dS )Nr"   c                 s   s    | ]	}|d ur|V  qd S r@   r"   ).0vr"   r"   r#   	<genexpr>j  s    z+MobileViTEncoder.forward.<locals>.<genexpr>)last_hidden_staterr   )	enumeraterX   r   ZtrainingZ_gradient_checkpointing_func__call__tupler   )r<   rr   r   r   Zall_hidden_statesr[   r^   r"   r"   r#   rA   U  s   
zMobileViTEncoder.forward)FT)rB   rC   rD   r   r3   rF   rG   rE   r   r   r   rA   rH   r"   r"   r=   r#   r   
  s    M
r   c                   @   sD   e Zd ZeZdZdZdZdgZde	e
je
je
jf ddfdd	ZdS )
MobileViTPreTrainedModel	mobilevitpixel_valuesTr   moduler   Nc                 C   st   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS dS )zInitialize the weightsg        )meanZstdNg      ?)r8   r   rf   r5   weightdataZnormal_r&   Zinitializer_ranger,   Zzero_r   Zfill_)r<   r   r"   r"   r#   _init_weightsw  s   
z&MobileViTPreTrainedModel._init_weights)rB   rC   rD   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesr   r   rf   r5   r   r   r"   r"   r"   r#   r   o  s    &r   c                       sl   e Zd Zddedef fddZdd Ze			dd	ee	j
 d
ee dee deeef fddZ  ZS )MobileViTModelTr&   expand_outputc                    sn   t  | || _|| _t||j|jd ddd| _t|| _	| jr1t||jd |jd dd| _
|   d	S )
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r'   r(   r)   r*   r      r   rK   N)r2   r3   r&   r   r%   Znum_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)r<   r&   r   r=   r"   r#   r3     s&   
zMobileViTModel.__init__c                 C   sF   |  D ]\}}| jj| }t|tr |jjD ]}|j| qqdS )zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr   rX   r8   r   r   r   r   )r<   Zheads_to_pruneZlayer_indexr   Zmobilevit_layerr   r"   r"   r#   _prune_heads  s   
zMobileViTModel._prune_headsNr   r   r   r   c           	      C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}| jr>| |d }tj	|ddgdd}n|d }d }|sY|d urN||fn|f}||dd   S t
|||jd	S )
Nz You have to specify pixel_valuesr   r   r   rs   rm   F)ru   Zkeepdimr   )r   pooler_outputrr   )r&   r   use_return_dictr4   r   r   r   r   rF   r   r   rr   )	r<   r   r   r   Zembedding_outputZencoder_outputsr   pooled_outputr   r"   r"   r#   rA     s0   
zMobileViTModel.forward)T)NNN)rB   rC   rD   r   rE   r3   r   r   r   rF   rG   r   r   r   rA   rH   r"   r"   r=   r#   r     s     

r   z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )Zcustom_introc                       sn   e Zd Zdeddf fddZe				ddeej dee	 deej d	ee	 de
eef f
d
dZ  ZS )MobileViTForImageClassificationr&   r   Nc                    sd   t  | |j| _t|| _tj|jdd| _|jdkr't	|j
d |jnt | _|   d S )NT)Zinplacer   rm   )r2   r3   
num_labelsr   r   r   ri   classifier_dropout_probrj   rf   r   ZIdentity
classifierr   r<   r&   r=   r"   r#   r3     s   
$z(MobileViTForImageClassification.__init__r   r   labelsr   c                 C   sh  |dur|n| j j}| j|||d}|r|jn|d }| | |}d}|dur| j jdu rS| jdkr9d| j _n| jdkrO|jt	j
ksJ|jt	jkrOd| j _nd| j _| j jdkrqt }	| jdkrk|	| | }n+|	||}n%| j jdkrt }	|	|d| j|d}n| j jdkrt }	|	||}|s|f|dd  }
|dur|f|
 S |
S t|||jd	S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   Z
regressionZsingle_label_classificationZmulti_label_classificationrm   r   )losslogitsrr   )r&   r   r   r   r   rj   Zproblem_typer   ZdtyperF   longr    r
   Zsqueezer	   ro   r   r   rr   )r<   r   r   r   r   outputsr   r   r   loss_fctr   r"   r"   r#   rA     s>   

"


z'MobileViTForImageClassification.forwardNNNN)rB   rC   rD   r   r3   r   r   rF   rG   rE   r   r   r   rA   rH   r"   r"   r=   r#   r     s$    
r   c                       r   )MobileViTASPPPoolingr&   r'   r(   r   Nc              	      s4   t    tjdd| _t|||ddddd| _d S )Nr   )Zoutput_sizeTrelu)r'   r(   r)   r*   r.   r/   )r2   r3   r   ZAdaptiveAvgPool2dglobal_poolr%   r   )r<   r&   r'   r(   r=   r"   r#   r3   )  s   
zMobileViTASPPPooling.__init__r?   c                 C   s:   |j dd  }| |}| |}tjj||ddd}|S )Nrs   r   Fr   )r   r   r   r   rz   r   )r<   r?   Zspatial_sizer"   r"   r#   rA   8  s
   

zMobileViTASPPPooling.forwardr`   r"   r"   r=   r#   r   (  s    r   c                       @   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r&   r   Nc                    s   t     jd  jt jdkrtdt | _	t
 ddd}| j	| | j	 fdd jD  t }| j	| t
 d	 ddd| _tj jd
| _d S )Nrs   r   z"Expected 3 values for atrous_ratesr   r   rL   c              
      s    g | ]}t  d |ddqS )r   r   )r'   r(   r)   r-   r/   )r%   )r   Zrater&   r'   r(   r"   r#   
<listcomp>Z  s    	z*MobileViTASPP.__init__.<locals>.<listcomp>r   )p)r2   r3   r   aspp_out_channelsr   Zatrous_ratesr4   r   rW   convsr%   rZ   extendr   projectri   Zaspp_dropout_probrj   )r<   r&   Zin_projectionZ
pool_layerr=   r   r#   r3   E  s2   


	zMobileViTASPP.__init__r?   c                 C   sD   g }| j D ]	}||| qtj|dd}| |}| |}|S r   )r   rZ   rF   r   r   rj   )r<   r?   ZpyramidconvZpooled_featuresr"   r"   r#   rA   p  s   


zMobileViTASPP.forward
rB   rC   rD   rT   r   r3   rF   rG   rA   rH   r"   r"   r=   r#   r   @  s    +r   c                       r   )
MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r&   r   Nc              	      sB   t    t|| _t|j| _t||j	|j
ddddd| _d S )Nr   FT)r'   r(   r)   r.   r/   r,   )r2   r3   r   asppr   Z	Dropout2dr   rj   r%   r   r   r   r   r=   r"   r#   r3     s   

zMobileViTDeepLabV3.__init__rr   c                 C   s&   |  |d }| |}| |}|S )Nrm   )r   rj   r   )r<   rr   r?   r"   r"   r#   rA     s   

zMobileViTDeepLabV3.forwardr   r"   r"   r=   r#   r   {  s    r   zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                       sn   e Zd Zdeddf fddZe				ddeej deej dee	 d	ee	 de
eef f
d
dZ  ZS ) MobileViTForSemanticSegmentationr&   r   Nc                    s8   t  | |j| _t|dd| _t|| _|   d S )NF)r   )r2   r3   r   r   r   r   segmentation_headr   r   r=   r"   r#   r3     s
   
z)MobileViTForSemanticSegmentation.__init__r   r   r   r   c                 C   s  |dur|n| j j}|dur|n| j j}|dur"| j jdkr"td| j|d|d}|r/|jn|d }| |}d}|durYtj	j
||jdd ddd	}	t| j jd
}
|
|	|}|s{|rg|f|dd  }n	|f|dd  }|dury|f| S |S t|||r|jddS dddS )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr   rs   r   Fr   )Zignore_indexr   )r   r   rr   Z
attentions)r&   r   r   r   r4   r   rr   r  r   rz   r   r   r	   Zsemantic_loss_ignore_indexr   )r<   r   r   r   r   r   Zencoder_hidden_statesr   r   Zupsampled_logitsr   r   r"   r"   r#   rA     sB   $

z(MobileViTForSemanticSegmentation.forwardr   )rB   rC   rD   r   r3   r   r   rF   rG   rE   r   r   r   rA   rH   r"   r"   r=   r#   r    s$    

r  )r   r  r   r   )r   N);rT   rx   typingr   r   r   r   r   rF   Ztorch.utils.checkpointr   Ztorch.nnr   r	   r
   Zactivationsr   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   utilsr   r   r   Zconfiguration_mobilevitr   Z
get_loggerrB   loggerr    r$   Moduler%   rI   rU   ra   r|   r   r   r   r   r   r   r   r   r   r   r   r   r   r  __all__r"   r"   r"   r#   <module>   sX   
 @03 *eUH;X