"""PyTorch ViT model."""

import collections.abc
import math
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    ImageClassifierOutput,
    MaskedImageModelingOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_vit import ViTConfig


logger = logging.get_logger(__name__)


class ViTEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    """

    def __init__(self, config: ViTConfig, use_mask_token: bool = False) -> None:
        super().__init__()

        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = ViTPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add the [CLS] token to the embedded patch tokens
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings
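

# Worked example for `interpolate_pos_encoding` (illustrative comment, not part
# of the upstream file): with ViT-Base defaults (patch_size=16, pretraining at
# 224x224) the table stores 1 + (224 // 16) ** 2 = 197 position embeddings. For
# a 384x384 input, the stored 14x14 grid of patch positions is bicubically
# resized to 24x24, giving 1 + 576 = 577 positions to match the new sequence.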
dejdedejfdd	Z  Z	S )r'   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizeZstride)r    r!   
image_sizer.   rL   r$   
isinstancecollectionsabcIterabler)   r	   Conv2d
projection)r/   r   r[   r.   rL   r$   r)   r0   r2   r3   r!      s   
 zViTPatchEmbeddings.__init__FrF   rE   r   c              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r9   )r>   rL   
ValueErrorr[   ra   flatten	transpose)r/   rF   rE   rK   rL   r5   r6   r4   r2   r2   r3   rN      s(   
zViTPatchEmbeddings.forwardrO   )


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class ViTSelfAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViTSelfOutput(nn.Module):
    """
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ViTAttention(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.attention = ViTSelfAttention(config)
        self.output = ViTSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ViTIntermediate(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)

        return hidden_states


class ViTOutput(nn.Module):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        hidden_states = hidden_states + input_tensor

        return hidden_states


class ViTLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: ViTConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = ViTAttention(config)
        self.intermediate = ViTIntermediate(config)
        self.output = ViTOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in ViT, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in ViT, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs
ededede	e
ef fddZ  ZS )
ViTEncoderr   r   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r2   )r   ).0_r   r2   r3   
<listcomp>  s    z'ViTEncoder.__init__.<locals>.<listcomp>F)	r    r!   r   r	   Z
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r0   r   r3   r!     s   
 
zViTEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]8\}}	|r||f }|d ur$|| nd }
| jr6| jr6| |	j||
|}n|	||
|}|d }|rI||d f }q|rQ||f }|s_tdd |||fD S t|||dS )Nr2   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r2   )r   vr2   r2   r3   	<genexpr>  s    z%ViTEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater   r   rq   Z_gradient_checkpointing_func__call__tupler   )r/   r   r   r   r   r   Zall_hidden_statesZall_self_attentionsiZlayer_moduleZlayer_head_maskZlayer_outputsr2   r2   r3   rN     s6   

zViTEncoder.forward)NFFT)rQ   rR   rS   r   r!   r#   rV   r   rU   r   r   r   rN   rY   r2   r2   r0   r3   r     s&    	
r   c                   @   sN   e Zd ZeZdZdZdZddgZdZ	dZ
deejejejf ddfd	d
ZdS )ViTPreTrainedModelvitrF   Tr   r   rh   r   Nc                 C   s  t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trtjj|jjt	j
d| jjd|jj|j_tjj|jjt	j
d| jjd|jj|j_|jdur|jj  dS dS dS )zInitialize the weightsrg   )meanZstdNrH   )r\   r	   r   r`   initZtrunc_normal_weightdatart   r#   rs   r   Zinitializer_rangero   ry   Zzero_r   Zfill_r   r*   r%   r&   )r/   rh   r2   r2   r3   _init_weights  s>   





z ViTPreTrainedModel._init_weights)rQ   rR   rS   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_sdpaZ_supports_flash_attn_2r   r	   r   r`   r   r   r2   r2   r2   r3   r     s    &r   c                       s   e Zd Zddededef fddZdefd	d
Zdee	e
e	 f ddfddZe							ddeej deej deej dee dee dee dee deeef fddZ  ZS )ViTModelTFr   add_pooling_layerr   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r    r!   r   r   r4   r   encoderr	   r   r$   r   	layernorm	ViTPoolerpooler	post_init)r/   r   r   r   r0   r2   r3   r!     s   
zViTModel.__init__r   c                 C   s   | j jS r   )r4   r(   )r/   r2   r2   r3   get_input_embeddings  s   zViTModel.get_input_embeddingsheads_to_pruneNc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r/   r   r   r   r2   r2   r3   _prune_heads  s   zViTModel._prune_headsrF   rG   r   r   r   rE   r   c                 C   s
  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| || j j}| jjj	j
j}|j|kr?||}| j|||d}	| j|	||||d}
|
d }| |}| jdurd| |nd}|s{|durp||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rG   rE   )r   r   r   r   r   r   )r   Zpooler_outputr   r   )r   r   r   use_return_dictrd   Zget_head_maskr   r4   r(   ra   r   ro   rt   r   r   r   r   r   r   )r/   rF   rG   r   r   r   rE   r   Zexpected_dtypeZembedding_outputZencoder_outputssequence_outputpooled_outputZhead_outputsr2   r2   r3   rN     s@   


zViTModel.forward)TFNNNNNNN)rQ   rR   rS   r   rU   r!   r'   r   r   rW   r   r   r   r   r#   rV   rX   r   r   r   rN   rY   r2   r2   r0   r3   r     s:    
	r   c                       s*   e Zd Zdef fddZdd Z  ZS )r   r   c                    s,   t    t|j|j| _t|j | _	d S r   )
r    r!   r	   r   r$   Zpooler_output_sizer   r   Z
pooler_act
activationr   r0   r2   r3   r!   B  s   
zViTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r/   r   Zfirst_token_tensorr   r2   r2   r3   rN   G  s   

zViTPooler.forward)rQ   rR   rS   r   r!   rN   rY   r2   r2   r0   r3   r   A  s    r   a[  


@auto_docstring(
    custom_intro="""
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    """
)
class ViTForMaskedImageModeling(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.vit = ViTModel(config, add_pooling_layer=False, use_mask_token=True)

        self.decoder = nn.Sequential(
            nn.Conv2d(
                in_channels=config.hidden_size,
                out_channels=config.encoder_stride**2 * config.num_channels,
                kernel_size=1,
            ),
            nn.PixelShuffle(config.encoder_stride),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedImageModelingOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if bool_masked_pos is not None and (self.config.patch_size != self.config.encoder_stride):
            raise ValueError(
                "When `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that "
                "the reconstructed image has the same dimensions as the input. "
                f"Got `patch_size` = {self.config.patch_size} and `encoder_stride` = {self.config.encoder_stride}."
            )

        outputs = self.vit(
            pixel_values,
            bool_masked_pos=bool_masked_pos,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # Reshape to (batch_size, num_channels, height, width)
        sequence_output = sequence_output[:, 1:]
        batch_size, sequence_length, num_channels = sequence_output.shape
        height = width = math.floor(sequence_length**0.5)
        sequence_output = sequence_output.permute(0, 2, 1).reshape(batch_size, num_channels, height, width)

        # Reconstruct pixel values
        reconstructed_pixel_values = self.decoder(sequence_output)

        masked_im_loss = None
        if bool_masked_pos is not None:
            size = self.config.image_size // self.config.patch_size
            bool_masked_pos = bool_masked_pos.reshape(-1, size, size)
            mask = (
                bool_masked_pos.repeat_interleave(self.config.patch_size, 1)
                .repeat_interleave(self.config.patch_size, 2)
                .unsqueeze(1)
                .contiguous()
            )
            reconstruction_loss = nn.functional.l1_loss(pixel_values, reconstructed_pixel_values, reduction="none")
            masked_im_loss = (reconstruction_loss * mask).sum() / (mask.sum() + 1e-5) / self.config.num_channels

        if not return_dict:
            output = (reconstructed_pixel_values,) + outputs[1:]
            return ((masked_im_loss,) + output) if masked_im_loss is not None else output

        return MaskedImageModelingOutput(
            loss=masked_im_loss,
            reconstruction=reconstructed_pixel_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
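

# Decoder shape sketch for the SimMIM head above (illustrative comment,
# assuming ViT-Base defaults: hidden_size=768, encoder_stride=16, num_channels=3):
#   patch features reshaped to:                (batch_size, 768, 14, 14)
#   -> Conv2d 1x1 to 16**2 * 3 = 768 channels: (batch_size, 768, 14, 14)
#   -> PixelShuffle(16):                       (batch_size, 3, 224, 224)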


@auto_docstring(
    custom_intro="""
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    """
)
class ViTForImageClassification(ViTPreTrainedModel):
    def __init__(self, config: ViTConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vit = ViTModel(config, add_pooling_layer=False)

        # Classifier head
        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            # move labels to the correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["ViTForImageClassification", "ViTForMaskedImageModeling", "ViTModel", "ViTPreTrainedModel"]