"""PyTorch PVT model."""

import collections
import math
from typing import Iterable, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging
from .configuration_pvt import PvtConfig


logger = logging.get_logger(__name__)


def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # work with tensors of arbitrary rank, not just 2D ConvNets
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
    random_tensor.floor_()  # binarize
    output = input.div(keep_prob) * random_tensor
    return output


class PvtDropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class PvtPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(
        self,
        config: PvtConfig,
        image_size: Union[int, Iterable[int]],
        patch_size: Union[int, Iterable[int]],
        stride: int,
        num_channels: int,
        hidden_size: int,
        cls_token: bool = False,
    ):
        super().__init__()
        self.config = config
        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.position_embeddings = nn.Parameter(
            torch.randn(1, num_patches + 1 if cls_token else num_patches, hidden_size)
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size)) if cls_token else None
        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=stride, stride=patch_size)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        num_patches = height * width
        if not torch.jit.is_tracing() and num_patches == self.config.image_size * self.config.image_size:
            return self.position_embeddings
        embeddings = embeddings.reshape(1, height, width, -1).permute(0, 3, 1, 2)
        interpolated_embeddings = F.interpolate(embeddings, size=(height, width), mode="bilinear")
        interpolated_embeddings = interpolated_embeddings.reshape(1, -1, height * width).permute(0, 2, 1)
        return interpolated_embeddings

    def forward(self, pixel_values: torch.Tensor) -> Tuple[torch.Tensor, int, int]:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        patch_embed = self.projection(pixel_values)
        *_, height, width = patch_embed.shape
        patch_embed = patch_embed.flatten(2).transpose(1, 2)
        embeddings = self.layer_norm(patch_embed)
        if self.cls_token is not None:
            cls_token = self.cls_token.expand(batch_size, -1, -1)
            embeddings = torch.cat((cls_token, embeddings), dim=1)
            position_embeddings = self.interpolate_pos_encoding(self.position_embeddings[:, 1:], height, width)
            position_embeddings = torch.cat((self.position_embeddings[:, :1], position_embeddings), dim=1)
        else:
            position_embeddings = self.interpolate_pos_encoding(self.position_embeddings, height, width)
        embeddings = self.dropout(embeddings + position_embeddings)
        return embeddings, height, width


class PvtSelfOutput(nn.Module):
    def __init__(self, config: PvtConfig, hidden_size: int):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class PvtEfficientSelfAttention(nn.Module):
    """Efficient self-attention mechanism with reduction of the sequence [PvT paper](https://arxiv.org/abs/2102.12122)."""

    def __init__(
        self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads

        if self.hidden_size % self.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention heads "
                f"({self.num_attention_heads})"
            )

        self.attention_head_size = int(self.hidden_size / self.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.sequences_reduction_ratio = sequences_reduction_ratio
        if sequences_reduction_ratio > 1:
            self.sequence_reduction = nn.Conv2d(
                hidden_size, hidden_size, kernel_size=sequences_reduction_ratio, stride=sequences_reduction_ratio
            )
            self.layer_norm = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)

    def transpose_for_scores(self, hidden_states: torch.Tensor) -> torch.Tensor:
        new_shape = hidden_states.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        hidden_states = hidden_states.view(new_shape)
        return hidden_states.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        height: int,
        width: int,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor]:
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        if self.sequences_reduction_ratio > 1:
            batch_size, seq_len, num_channels = hidden_states.shape
            # Reshape to (batch_size, num_channels, height, width)
            hidden_states = hidden_states.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)
            # Apply the strided convolution that reduces the sequence length
            hidden_states = self.sequence_reduction(hidden_states)
            # Reshape back to (batch_size, seq_len, num_channels)
            hidden_states = hidden_states.reshape(batch_size, num_channels, -1).permute(0, 2, 1)
            hidden_states = self.layer_norm(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        return outputs


class PvtAttention(nn.Module):
    def __init__(
        self, config: PvtConfig, hidden_size: int, num_attention_heads: int, sequences_reduction_ratio: float
    ):
        super().__init__()
        self.self = PvtEfficientSelfAttention(
            config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequences_reduction_ratio=sequences_reduction_ratio,
        )
        self.output = PvtSelfOutput(config, hidden_size=hidden_size)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False
    ) -> Tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, height, width, output_attentions)

        attention_output = self.output(self_outputs[0])
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class PvtFFN(nn.Module):
    def __init__(
        self,
        config: PvtConfig,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
    ):
        super().__init__()
        out_features = out_features if out_features is not None else in_features
        self.dense1 = nn.Linear(in_features, hidden_features)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act
        self.dense2 = nn.Linear(hidden_features, out_features)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense1(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense2(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class PvtLayer(nn.Module):
    def __init__(
        self,
        config: PvtConfig,
        hidden_size: int,
        num_attention_heads: int,
        drop_path: float,
        sequences_reduction_ratio: float,
        mlp_ratio: float,
    ):
        super().__init__()
        self.layer_norm_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        self.attention = PvtAttention(
            config=config,
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            sequences_reduction_ratio=sequences_reduction_ratio,
        )
        self.drop_path = PvtDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.layer_norm_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
        mlp_hidden_size = int(hidden_size * mlp_ratio)
        self.mlp = PvtFFN(config=config, in_features=hidden_size, hidden_features=mlp_hidden_size)

    def forward(self, hidden_states: torch.Tensor, height: int, width: int, output_attentions: bool = False):
        self_attention_outputs = self.attention(
            hidden_states=self.layer_norm_1(hidden_states),
            height=height,
            width=width,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]

        attention_output = self.drop_path(attention_output)
        hidden_states = attention_output + hidden_states

        mlp_output = self.mlp(self.layer_norm_2(hidden_states))
        mlp_output = self.drop_path(mlp_output)
        layer_output = hidden_states + mlp_output

        outputs = (layer_output,) + outputs
        return outputs


class PvtEncoder(nn.Module):
    def __init__(self, config: PvtConfig):
        super().__init__()
        self.config = config

        # stochastic depth decay rule
        drop_path_decays = torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu").tolist()

        # patch embeddings, one per encoder block
        embeddings = []
        for i in range(config.num_encoder_blocks):
            embeddings.append(
                PvtPatchEmbeddings(
                    config=config,
                    image_size=config.image_size if i == 0 else self.config.image_size // (2 ** (i + 1)),
                    patch_size=config.patch_sizes[i],
                    stride=config.strides[i],
                    num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
                    hidden_size=config.hidden_sizes[i],
                    cls_token=i == config.num_encoder_blocks - 1,
                )
            )
        self.patch_embeddings = nn.ModuleList(embeddings)

        # Transformer blocks
        blocks = []
        cur = 0
        for i in range(config.num_encoder_blocks):
            # each block consists of several layers
            layers = []
            if i != 0:
                cur += config.depths[i - 1]
            for j in range(config.depths[i]):
                layers.append(
                    PvtLayer(
                        config=config,
                        hidden_size=config.hidden_sizes[i],
                        num_attention_heads=config.num_attention_heads[i],
                        drop_path=drop_path_decays[cur + j],
                        sequences_reduction_ratio=config.sequence_reduction_ratios[i],
                        mlp_ratio=config.mlp_ratios[i],
                    )
                )
            blocks.append(nn.ModuleList(layers))
        self.block = nn.ModuleList(blocks)

        # Layer norm applied to the last hidden state
        self.layer_norm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[Tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        batch_size = pixel_values.shape[0]
        num_blocks = len(self.block)
        hidden_states = pixel_values
        for idx, (embedding_layer, block_layer) in enumerate(zip(self.patch_embeddings, self.block)):
            # first, obtain patch embeddings
            hidden_states, height, width = embedding_layer(hidden_states)
            # second, send embeddings through the layers of this block
            for block in block_layer:
                layer_outputs = block(hidden_states, height, width, output_attentions)
                hidden_states = layer_outputs[0]
                if output_attentions:
                    all_self_attentions = all_self_attentions + (layer_outputs[1],)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            # reshape back to a feature map for the next stage, except after the last one
            if idx != num_blocks - 1:
                hidden_states = hidden_states.reshape(batch_size, height, width, -1).permute(0, 3, 1, 2).contiguous()
        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)
        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class PvtPreTrainedModel(PreTrainedModel):
    config_class = PvtConfig
    base_model_prefix = "pvt"
    main_input_name = "pixel_values"
    _no_split_modules = []

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data, mean=0.0, std=self.config.initializer_range
            )
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, PvtPatchEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data, mean=0.0, std=self.config.initializer_range
            )
            if module.cls_token is not None:
                module.cls_token.data = nn.init.trunc_normal_(
                    module.cls_token.data, mean=0.0, std=self.config.initializer_range
                )


@auto_docstring
class PvtModel(PvtPreTrainedModel):
    def __init__(self, config: PvtConfig):
        super().__init__(config)
        self.config = config

        # hierarchical Transformer encoder
        self.encoder = PvtEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return BaseModelOutput(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """
)
class PvtForImageClassification(PvtPreTrainedModel):
    def __init__(self, config: PvtConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.pvt = PvtModel(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor],
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.pvt(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        # classify on the final [CLS] token
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["PvtForImageClassification", "PvtModel", "PvtPreTrainedModel"]