import collections.abc
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, logging, torch_int
from .configuration_ijepa import IJepaConfig


logger = logging.get_logger(__name__)


class IJepaPatchEmbeddings(nn.Module):
    """
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        if not interpolate_pos_encoding:
            if height != self.image_size[0] or width != self.image_size[1]:
                raise ValueError(
                    f"Input image size ({height}*{width}) doesn't match model"
                    f" ({self.image_size[0]}*{self.image_size[1]})."
                )
        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
        return embeddings


class IJepaEmbeddings(nn.Module):
    """
    Construct the position and patch embeddings. Optionally, also the mask token. (I-JEPA uses no CLS token.)
    """

    def __init__(self, config: IJepaConfig, use_mask_token: bool = False) -> None:
        super().__init__()
        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
        self.patch_embeddings = IJepaPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches, config.hidden_size))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.patch_size = config.patch_size
        self.config = config

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1]
        num_positions = self.position_embeddings.shape[1]

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        patch_pos_embed = self.position_embeddings
        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed, size=(new_height, new_width), mode="bicubic", align_corners=False
        )
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        return patch_pos_embed

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        if bool_masked_pos is not None:
            seq_length = embeddings.shape[1]
            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
            # replace the masked visual tokens by mask_tokens
            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask

        # add positional encoding to each token
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embeddings

        embeddings = self.dropout(embeddings)

        return embeddings


@auto_docstring
class IJepaPreTrainedModel(PreTrainedModel):
    config_class = IJepaConfig
    base_model_prefix = "ijepa"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["IJepaEmbeddings", "IJepaLayer"]
    _supports_sdpa = True
    _supports_flash_attn_2 = True

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # Upcast the input in `fp32` and cast it back to the desired `dtype` to avoid
            # `trunc_normal_cpu` not implemented in `half` issues
            module.weight.data = nn.init.trunc_normal_(
                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
            ).to(module.weight.dtype)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, IJepaEmbeddings):
            module.position_embeddings.data = nn.init.trunc_normal_(
                module.position_embeddings.data.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            ).to(module.position_embeddings.dtype)
            if module.mask_token is not None:
                module.mask_token.data.zero_()


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    # Take the dot product between "query" and "key" to get the raw attention scores.
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    # Normalize the attention scores to probabilities.
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    # Mask heads if we want to
    if attention_mask is not None:
        attn_weights = attn_weights * attention_mask

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
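# Illustrative shape walk-through for `eager_attention_forward` above (the concrete
# numbers are an assumed ViT-Huge-like setup, hidden_size=1280, 16 heads, 224x224
# inputs with patch size 14, i.e. 256 patches and head_dim = 1280 // 16 = 80):
#   query/key/value: (2, 16, 256, 80)
#   attn_weights = softmax(q @ k^T * 80 ** -0.5) -> (2, 16, 256, 256)
#   attn_output  = attn_weights @ v              -> (2, 16, 256, 80)
#   transpose(1, 2)                              -> (2, 256, 16, 80),
# which the caller then reshapes back to (2, 256, 1280).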
class IJepaSelfAttention(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.config = config
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.dropout_prob = config.attention_probs_dropout_prob
        self.scaling = self.attention_head_size**-0.5
        self.is_causal = False

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`."
                    " Falling back to eager attention. This warning can be removed using the argument"
                    ' `attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        context_layer, attention_probs = attention_interface(
            self,
            query_layer,
            key_layer,
            value_layer,
            head_mask,
            is_causal=self.is_causal,
            scaling=self.scaling,
            dropout=0.0 if not self.training else self.dropout_prob,
        )

        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.reshape(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class IJepaSelfOutput(nn.Module):
    """
    The residual connection is defined in IJepaLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    """

    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class IJepaAttention(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.attention = IJepaSelfAttention(config)
        self.output = IJepaSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)

        attention_output = self.output(self_outputs[0], hidden_states)

        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class IJepaIntermediate(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class IJepaOutput(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class IJepaLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = IJepaAttention(config)
        self.intermediate = IJepaIntermediate(config)
        self.output = IJepaOutput(config)
        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        self_attention_outputs = self.attention(
            self.layernorm_before(hidden_states),  # in IJepa, layernorm is applied before self-attention
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in IJepa, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.output(layer_output, hidden_states)

        outputs = (layer_output,) + outputs

        return outputs


class IJepaEncoder(nn.Module):
    def __init__(self, config: IJepaConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([IJepaLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    layer_head_mask,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class IJepaPooler(nn.Module):
    def __init__(self, config: IJepaConfig):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.pooler_output_size)
        self.activation = ACT2FN[config.pooler_act]

    def forward(self, hidden_states):
        # "Pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class IJepaModel(IJepaPreTrainedModel):
    def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mask_token: bool = False):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `False`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        """
        super().__init__(config)
        self.config = config
        self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token)
        self.encoder = IJepaEncoder(config)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.pooler = IJepaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> IJepaPatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr   r   r   r   )r)   r   r   r   r-   r-   r.   _prune_heads  s   zIJepaModel._prune_headsr/   rY   r   r   r   r0   r   c                 C   s
  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| || j j}| jjj	j
j}|j|kr?||}| j|||d}	| j|	||||d}
|
d }| |}| jdurd| |nd}|s{|durp||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
        if pixel_values.dtype != expected_dtype:
            pixel_values = pixel_values.to(expected_dtype)

        embedding_output = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
            return head_outputs + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )Zcustom_introc                       s   e Zd Zdeddf fddZe							ddeej deej deej d	ee	 d
ee	 dee	 dee	 de
eef fddZ  ZS )IJepaForImageClassificationr*   r1   Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r   r   
num_labelsr   rb   r&   rg   r    ZIdentity
classifierr   r   r+   r-   r.   r   ?  s
   $z$IJepaForImageClassification.__init__r/   r   labelsr   r   r0   r   c                 C   sv  |dur|n| j j}| j||||||d}|d }	| |	jdd}
d}|dur||
j}| j jdu rX| jdkr>d| j _n| jdkrT|j	t
jksO|j	t
jkrTd| j _nd| j _| j jdkrvt }| jdkrp||
 | }n+||
|}n%| j jdkrt }||
d	| j|d	}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd
S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r0   r   r   r   r   Z
regressionZsingle_label_classificationZmulti_label_classificationrP   )losslogitsr   r   )r*   r   rb   r   rf   rk   ZdeviceZproblem_typer   rm   rC   longr_   r   Zsqueezer
   rW   r	   r   r   r   )r)   r/   r   r   r   r   r0   r   r   r   r   r   Zloss_fctr   r-   r-   r.   r=   K  sP   	

"


z#IJepaForImageClassification.forwardr   )r?   r@   rA   r   r   r   r   rC   rD   rE   r   r   r   r=   rF   r-   r-   r+   r.   r   0  s6    
	r   )ra   r   r   )re   )7collections.abcr"   typingr   r   r   r   r   r   r   rC   Ztorch.nnr&   r	   r
   r   Zactivationsr   Zmodeling_outputsr   r   r   Zmodeling_utilsr   r   Zpytorch_utilsr   r   utilsr   r   r   Zconfiguration_ijepar   Z
get_loggerr?   r   Moduler   rG   ra   rD   floatr|   r}   r   r   r   r   rc   r   r   r   r   __all__r-   r-   r-   r.   <module>   s^   $
'Q'
>'*3]R