o
    Zh$                     @   s   d dl mZmZ d dlZd dlmZ d dlmZmZmZ d dl	m
Z
 ddlmZ ddlmZ ddlmZmZ d	d
lmZmZmZ G dd deZeG dd deZG dd deeZeddG dd deeZg dZdS )    )OptionalUnionN)BCEWithLogitsLossCrossEntropyLossMSELoss)IJepaConfig   )ImageClassifierOutput)PreTrainedModel)auto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelc                	       st   e Zd Zddededdf fddZdejd	ed
edejfddZ			ddejde
ej dedejfddZ  ZS )IJepaEmbeddingsFconfiguse_mask_tokenreturnNc                    s6   t  || | `| jj}ttd||j	| _
d S )N   )super__init__Z	cls_tokenpatch_embeddingsnum_patchesnn	ParametertorchZrandnZhidden_sizeposition_embeddings)selfr   r   r   	__class__ V/var/www/auris/lib/python3.10/site-packages/transformers/models/ijepa/modular_ijepa.pyr      s   zIJepaEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d }| jj d }tj s||kr||kr| jS | j}|j d }|| j }|| j }	t|d }
|d|
|
|}|dddd}t	j
j|||	fddd	}|dddddd|}|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   ZbicubicF)sizemodeZalign_corners)shaper   r   Zjit
is_tracingZ
patch_sizer   ZreshapeZpermuter   Z
functionalZinterpolateview)r   r#   r$   r%   r   Znum_positionsZpatch_pos_embeddimZ
new_heightZ	new_widthZsqrt_num_positionsr!   r!   r"   interpolate_pos_encoding   s&   




z(IJepaEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr-   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }|r=|| ||| }n|| j }| |}|S )N)r-   r   r&         ?)	r)   r   
mask_tokenexpandZ	unsqueezeZtype_asr-   r   Zdropout)r   r.   r/   r-   Z
batch_size_r$   r%   r#   Z
seq_lengthZmask_tokensmaskr!   r!   r"   forward>   s   


zIJepaEmbeddings.forward)F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr-   r   Z
BoolTensorr5   __classcell__r!   r!   r   r"   r      s    *r   c                   @   sN   e Zd ZeZdZdZdZddgZdZ	dZ
deejejejf ddfd	d
ZdS )IJepaPreTrainedModelijepar.   Tr   Z
IJepaLayermoduler   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |trotjj|jjt	j
d| jjd|jj|j_|jdurq|jj  dS dS dS )zInitialize the weightsg        )meanZstdNr0   )
isinstancer   LinearConv2dinitZtrunc_normal_weightdatator   Zfloat32r   Zinitializer_rangedtypeZbiasZzero_	LayerNormZfill_r   r   r1   )r   r?   r!   r!   r"   _init_weightsc   s0   




z"IJepaPreTrainedModel._init_weights)r6   r7   r8   r   Zconfig_classZbase_model_prefixZmain_input_nameZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_sdpaZ_supports_flash_attn_2r   r   rB   rC   rI   rJ   r!   r!   r!   r"   r=   Y   s    &r=   c                       s,   e Zd Zddededef fddZ  ZS )
IJepaModelFr   add_pooling_layerr   c                    s$   t  | || _t||d| _dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r#   )r   r   rL   r   r   r!   r"   r   {   s   zIJepaModel.__init__)FF)r6   r7   r8   r   r9   r   r<   r!   r!   r   r"   rK   z   s    $rK   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )Zcustom_introc                       s   e Zd Zdef fddZ							ddeej deej deej dee d	ee d
ee dee de	e
ef fddZ  ZS )IJepaForImageClassificationr   c                    s&   t  | t|dd| _|   d S )NF)rL   )r   r   rK   r>   Z	post_init)r   r   r   r!   r"   r      s   z$IJepaForImageClassification.__init__Nr.   	head_masklabelsoutput_attentionsoutput_hidden_statesr-   return_dictr   c                 C   sv  |dur|n| j j}| j||||||d}|d }	| |	jdd}
d}|dur||
j}| j jdu rX| jdkr>d| j _n| jdkrT|j	t
jksO|j	t
jkrTd| j _nd| j _| j jdkrvt }| jdkrp||
 | }n+||
|}n%| j jdkrt }||
d	| j|d	}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)rN   rP   rQ   r-   rR   r   r   )r,   Z
regressionZsingle_label_classificationZmulti_label_classificationr&   )losslogitshidden_states
attentions)r   Zuse_return_dictr>   Z
classifierr@   rG   ZdeviceZproblem_typeZ
num_labelsrH   r   longr;   r   Zsqueezer   r+   r   r	   rU   rV   )r   r.   rN   rO   rP   rQ   r-   rR   ZoutputsZsequence_outputrT   rS   Zloss_fctoutputr!   r!   r"   r5      sP   	

"


z#IJepaForImageClassification.forward)NNNNNNN)r6   r7   r8   r   r   r   r   r:   r9   r   tupler	   r5   r<   r!   r!   r   r"   rM      s4    
	rM   )r=   rK   rM   )typingr   r   r   Ztorch.nnr   r   r   r   Z-transformers.models.ijepa.configuration_ijepar   Zmodeling_outputsr	   Zmodeling_utilsr
   utilsr   r   Zvit.modeling_vitr   r   r   r   r=   rK   rM   __all__r!   r!   r!   r"   <module>   s$    J J