o
    Zhh                     @   s  d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZ eeZeG dd deZG dd de
jZ G dd de
jZ!G dd de
jZ"G dd de
jZ#G dd de
jZ$G dd de
jZ%G dd de
jZ&G dd de
jZ'G dd  d e
jZ(G d!d" d"e
jZ)G d#d$ d$e
jZ*eG d%d& d&eZ+eG d'd( d(e+Z,ed)d*G d+d, d,e+Z-ed-d*G d.d/ d/e+Z.g d0Z/dS )1zPyTorch LeViT model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionModelOutput)PreTrainedModel)auto_docstringlogging   )LevitConfigc                   @   s^   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dS ),LevitForImageClassificationWithTeacherOutputa  
    Output type of [`LevitForImageClassificationWithTeacher`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the `cls_logits` and `distillation_logits`.
        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
            plus the initial embedding outputs.
    Nlogits
cls_logitsdistillation_logitshidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r    r    r    W/var/www/auris/lib/python3.10/site-packages/transformers/models/levit/modeling_levit.pyr   (   s   
 r   c                       s,   e Zd ZdZ	d fdd	Zdd Z  ZS )LevitConvEmbeddingsz[
    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
    r   c	           	   
      s6   t    tj|||||||dd| _t|| _d S )NF)dilationgroupsbias)super__init__r   Conv2dconvolutionBatchNorm2d
batch_norm)	selfZin_channelsZout_channelskernel_sizestridepaddingr#   r$   bn_weight_init	__class__r    r!   r'   G   s
   
zLevitConvEmbeddings.__init__c                 C   s   |  |}| |}|S N)r)   r+   )r,   
embeddingsr    r    r!   forwardP      

zLevitConvEmbeddings.forward)r   r   r   r   r   r   r   r'   r5   __classcell__r    r    r1   r!   r"   B   s
    	r"   c                       (   e Zd ZdZ fddZdd Z  ZS )LevitPatchEmbeddingsz
    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
    `LevitConvEmbeddings`.
    c                    s   t    t|j|jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd d |j|j|j| _t	
 | _t|jd d |jd |j|j|j| _|j| _d S )Nr            )r&   r'   r"   num_channelshidden_sizesr-   r.   r/   embedding_layer_1r   	Hardswishactivation_layer_1embedding_layer_2activation_layer_2embedding_layer_3activation_layer_3embedding_layer_4r,   configr1   r    r!   r'   \   s"   

$
$
 zLevitPatchEmbeddings.__init__c                 C   st   |j d }|| jkrtd| |}| |}| |}| |}| |}| |}| 	|}|
dddS )Nr   zeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r=   )shaper>   
ValueErrorr@   rB   rC   rD   rE   rF   rG   flatten	transpose)r,   pixel_valuesr>   r4   r    r    r!   r5   r   s   








zLevitPatchEmbeddings.forwardr7   r    r    r1   r!   r:   V   s    r:   c                       s&   e Zd Zd fdd	Zdd Z  ZS )MLPLayerWithBNr   c                    s,   t    tj||dd| _t|| _d S )NF)Zin_featuresZout_featuresr%   )r&   r'   r   LinearlinearBatchNorm1dr+   )r,   	input_dim
output_dimr0   r1   r    r!   r'      s   
zMLPLayerWithBN.__init__c                 C   s&   |  |}| |dd|}|S )Nr   r   )rQ   r+   rL   Z
reshape_asr,   hidden_stater    r    r!   r5      s   
zMLPLayerWithBN.forward)r   r   r   r   r'   r5   r8   r    r    r1   r!   rO      s    rO   c                       s$   e Zd Z fddZdd Z  ZS )LevitSubsamplec                       t    || _|| _d S r3   )r&   r'   r.   
resolution)r,   r.   rZ   r1   r    r!   r'         

zLevitSubsample.__init__c                 C   sL   |j \}}}||| j| j|d d d d | jd d | jf |d|}|S )N)rJ   viewrZ   r.   reshape)r,   rV   
batch_size_Zchannelsr    r    r!   r5      s   
zLevitSubsample.forwardrW   r    r    r1   r!   rX      s    rX   c                       B   e Zd Z fddZe d
 fdd	Zdd Zdd	 Z  Z	S )LevitAttentionc                    sB  t    || _|d | _|| _|| _|| | || d  | _|| | | _t|| j| _	t
 | _t| j|dd| _ttt|t|}t|}i g }}	|D ],}
|D ]'}t|
d |d  t|
d |d  f}||vrwt|||< |	||  qWqSi | _tj
t|t|| _| jdt|	||dd d S )	N      r=   r   )r0   r   attention_bias_idxsF
persistent)r&   r'   num_attention_headsscalekey_dimattention_ratioout_dim_keys_valuesout_dim_projectionrO   queries_keys_valuesr   rA   
activation
projectionlist	itertoolsproductrangelenabsappendattention_bias_cacher   	Parameterzerosattention_biasesregister_buffer
LongTensorr]   )r,   r?   ri   rg   rj   rZ   points
len_pointsattention_offsetsindicesp1p2offsetr1   r    r!   r'      s2   



(
zLevitAttention.__init__Tc                    (   t  | |r| jri | _d S d S d S r3   r&   trainrw   r,   moder1   r    r!   r         

zLevitAttention.trainc                 C   P   | j r| jd d | jf S t|}|| jvr#| jd d | jf | j|< | j| S r3   trainingrz   rd   strrw   r,   deviceZ
device_keyr    r    r!   get_attention_biases      

z#LevitAttention.get_attention_biasesc           
      C   s   |j \}}}| |}|||| jdj| j| j| j| j gdd\}}}|dddd}|dddd}|dddd}||dd | j	 | 
|j }	|	jdd}	|	| dd||| j}| | |}|S Nr\   r
   dimr   r=   r   )rJ   rm   r]   rg   splitri   rj   permuterM   rh   r   r   softmaxr^   rl   ro   rn   )
r,   rV   r_   
seq_lengthr`   rm   querykeyvalue	attentionr    r    r!   r5      s   
"zLevitAttention.forwardT
r   r   r   r'   r   Zno_gradr   r   r5   r8   r    r    r1   r!   rb      s    	rb   c                       ra   )LevitAttentionSubsamplec	                    s  t    || _|d | _|| _|| _|| | ||  | _|| | | _|| _t	|| j| _
t||| _t	||| | _t | _t	| j|| _i | _ttt|t|}	ttt|t|}
t|	t|
}}i g }}|
D ]>}|	D ]9}d}t|d | |d  |d d  t|d | |d  |d d  f}||vrt|||< |||  qxqttjt|t|| _| jdt| ||dd d S )Nrc   r   r   r=   rd   Fre   )!r&   r'   rg   rh   ri   rj   rk   rl   resolution_outrO   keys_valuesrX   queries_subsamplequeriesr   rA   rn   ro   rw   rp   rq   rr   rs   rt   ru   rv   r   rx   ry   rz   r{   r|   r]   )r,   rS   rT   ri   rg   rj   r.   resolution_inr   r}   Zpoints_r~   Zlen_points_r   r   r   r   sizer   r1   r    r!   r'      s<   



H
z LevitAttentionSubsample.__init__Tc                    r   r3   r   r   r1   r    r!   r     r   zLevitAttentionSubsample.trainc                 C   r   r3   r   r   r    r    r!   r     r   z,LevitAttentionSubsample.get_attention_biasesc           	      C   s   |j \}}}| |||| jdj| j| j| j gdd\}}|dddd}|dddd}| | 	|}||| j
d | j| jdddd}||dd | j | |j }|jdd}|| dd|d| j}| | |}|S r   )rJ   r   r]   rg   r   ri   rj   r   r   r   r   rM   rh   r   r   r   r^   rl   ro   rn   )	r,   rV   r_   r   r`   r   r   r   r   r    r    r!   r5     s"   "zLevitAttentionSubsample.forwardr   r   r    r    r1   r!   r      s    -	r   c                       r9   )LevitMLPLayerzE
    MLP Layer with `2X` expansion in contrast to ViT with `4X`.
    c                    s0   t    t||| _t | _t||| _d S r3   )r&   r'   rO   	linear_upr   rA   rn   linear_down)r,   rS   
hidden_dimr1   r    r!   r'   2  s   

zLevitMLPLayer.__init__c                 C   s"   |  |}| |}| |}|S r3   )r   rn   r   rU   r    r    r!   r5   8  s   


zLevitMLPLayer.forwardr7   r    r    r1   r!   r   -  s    r   c                       r9   )LevitResidualLayerz"
    Residual Block for LeViT
    c                    rY   r3   )r&   r'   module	drop_rate)r,   r   r   r1   r    r!   r'   D  r[   zLevitResidualLayer.__init__c                 C   sn   | j r.| jdkr.tj|ddd|jd}|| jd| j  }|| 	||  }|S || 	| }|S )Nr   r   )r   )
r   r   r   Zrandr   r   Zge_divdetachr   )r,   rV   Zrndr    r    r!   r5   I  s   zLevitResidualLayer.forwardr7   r    r    r1   r!   r   ?      r   c                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )
LevitStagezP
    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
    c                    sD  t    g | _|| _|
| _t|D ])}| jtt|||||
| jj	 |dkr;|| }| jtt
||| jj	 q|	d dkr| jd |	d  d | _| jt| jj||d  |	d |	d |	d |	d |
| jd | j| _|	d dkr| jj|d  |	d  }| jtt
| jj|d  || jj	 t| j| _d S )	Nr   Z	Subsampler      r=   r
   )ri   rg   rj   r.   r   r   r<   )r&   r'   layersrI   r   rs   rv   r   rb   Zdrop_path_rater   r   r   r?   r   
ModuleList)r,   rI   idxr?   ri   depthsrg   rj   	mlp_ratiodown_opsr   r`   r   r1   r    r!   r'   Y  sN   
zLevitStage.__init__c                 C   s   | j S r3   )r   )r,   r    r    r!   get_resolution  s   zLevitStage.get_resolutionc                 C   s   | j D ]}||}q|S r3   )r   )r,   rV   layerr    r    r!   r5     r6   zLevitStage.forward)r   r   r   r   r'   r   r5   r8   r    r    r1   r!   r   T  s
    7r   c                       s*   e Zd ZdZ fddZdddZ  ZS )	LevitEncoderzC
    LeViT Encoder consisting of multiple `LevitStage` stages.
    c                    s   t    || _| jj| jj }g | _| jjdg tt	|j
D ].}t|||j| |j| |j
| |j| |j| |j| |j| |
}| }| j| q"t| j| _d S )N )r&   r'   rI   Z
image_sizeZ
patch_sizestagesr   rv   rs   rt   r   r   r?   ri   rg   rj   r   r   r   r   )r,   rI   rZ   Z	stage_idxstager1   r    r!   r'     s*   
zLevitEncoder.__init__FTc                 C   sb   |rdnd }| j D ]}|r||f }||}q	|r||f }|s+tdd ||fD S t||dS )Nr    c                 s   s    | ]	}|d ur|V  qd S r3   r    ).0vr    r    r!   	<genexpr>  s    z'LevitEncoder.forward.<locals>.<genexpr>)last_hidden_stater   )r   tupler   )r,   rV   output_hidden_statesreturn_dictZall_hidden_statesr   r    r    r!   r5     s   



zLevitEncoder.forward)FTr7   r    r    r1   r!   r     s    r   c                       r9   )LevitClassificationLayerz$
    LeViT Classification Layer
    c                    s(   t    t|| _t||| _d S r3   )r&   r'   r   rR   r+   rP   rQ   )r,   rS   rT   r1   r    r!   r'     s   
z!LevitClassificationLayer.__init__c                 C   s   |  |}| |}|S r3   )r+   rQ   )r,   rV   r   r    r    r!   r5     r6   z LevitClassificationLayer.forwardr7   r    r    r1   r!   r     r   r   c                   @   s&   e Zd ZeZdZdZdgZdd ZdS )LevitPreTrainedModellevitrN   r   c                 C   sz   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjtjfr;|j	j
  |jjd dS dS )zInitialize the weightsg        )meanZstdNg      ?)
isinstancer   rP   r(   weightdataZnormal_rI   Zinitializer_ranger%   Zzero_rR   r*   Zfill_)r,   r   r    r    r!   _init_weights  s   
z"LevitPreTrainedModel._init_weightsN)	r   r   r   r   Zconfig_classZbase_model_prefixZmain_input_nameZ_no_split_modulesr   r    r    r    r!   r     s    r   c                       X   e Zd Z fddZe			d
deej dee dee de	e
ef fdd	Z  ZS )
LevitModelc                    s2   t  | || _t|| _t|| _|   d S r3   )r&   r'   rI   r:   patch_embeddingsr   encoder	post_initrH   r1   r    r!   r'     s
   

zLevitModel.__init__NrN   r   r   returnc                 C   s   |d ur|n| j j}|d ur|n| j j}|d u rtd| |}| j|||d}|d }|jdd}|s?||f|dd   S t|||jdS )Nz You have to specify pixel_valuesr   r   r   r   r   )r   Zpooler_outputr   )	rI   r   use_return_dictrK   r   r   r   r   r   )r,   rN   r   r   r4   Zencoder_outputsr   Zpooled_outputr    r    r!   r5     s(   
zLevitModel.forwardNNN)r   r   r   r'   r   r   r   r   boolr   r   r   r5   r8   r    r    r1   r!   r     s    
r   z
    Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )Zcustom_introc                       sd   e Zd Z fddZe				ddeej deej dee	 dee	 de
eef f
d	d
Z  ZS )LevitForImageClassificationc                    sX   t  | || _|j| _t|| _|jdkr t|jd |jntj	
 | _|   d S Nr   r\   )r&   r'   rI   
num_labelsr   r   r   r?   r   r   Identity
classifierr   rH   r1   r    r!   r'   !  s   

z$LevitForImageClassification.__init__NrN   labelsr   r   r   c                 C   sb  |dur|n| j j}| j|||d}|d }|d}| |}d}|dur| j jdu rP| jdkr6d| j _n| jdkrL|jtj	ksG|jtj
krLd| j _nd| j _| j jdkrnt }	| jdkrh|	| | }n+|	||}n%| j jdkrt }	|	|d| j|d}n| j jdkrt }	|	||}|s|f|d	d  }
|dur|f|
 S |
S t|||jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationr\   r=   )lossr   r   )rI   r   r   r   r   Zproblem_typer   Zdtyper   longintr	   Zsqueezer   r]   r   r   r   )r,   rN   r   r   r   outputssequence_outputr   r   Zloss_fctoutputr    r    r!   r5   1  s@   



"


z#LevitForImageClassification.forward)NNNN)r   r   r   r'   r   r   r   r   r|   r   r   r   r   r5   r8   r    r    r1   r!   r     s$    
r   ap  
    LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and
    a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning::
           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       r   )&LevitForImageClassificationWithTeacherc                    s   t  | || _|j| _t|| _|jdkr t|jd |jntj	
 | _|jdkr4t|jd |jntj	
 | _|   d S r   )r&   r'   rI   r   r   r   r   r?   r   r   r   r   classifier_distillr   rH   r1   r    r!   r'   q  s   


z/LevitForImageClassificationWithTeacher.__init__NrN   r   r   r   c           
      C   s   |d ur|n| j j}| j|||d}|d }|d}| || |}}|| d }|s;|||f|dd   }	|	S t||||jdS )Nr   r   r   r=   )r   r   r   r   )rI   r   r   r   r   r   r   r   )
r,   rN   r   r   r   r   r   Zdistill_logitsr   r   r    r    r!   r5     s   
z.LevitForImageClassificationWithTeacher.forwardr   )r   r   r   r'   r   r   r   r   r   r   r   r   r5   r8   r    r    r1   r!   r   h  s    	
r   )r   r   r   r   )0r   rq   dataclassesr   typingr   r   r   r   Ztorch.utils.checkpointr   Ztorch.nnr   r   r	   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   utilsr   r   Zconfiguration_levitr   Z
get_loggerr   loggerr   Moduler"   r:   rO   rX   rb   r   r   r   r   r   r   r   r   r   r   __all__r    r    r    r!   <module>   sN   
,>SE..H2