o
    ZhX                     @  s  d Z ddlmZ ddlZddlZddlmZ ddlm	Z	m
Z
mZ ddlZddlmZ ddlmZmZmZmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' e$(e)Z*dZ+dZ,g dZ-dZ.dZ/eG dd de Z0G dd dej1j2Z3G dd dej1j2Z4G dd dej1j2Z5G dd dej1j2Z6G dd dej1j2Z7G dd dej1j2Z8G d d! d!ej1j2Z9G d"d# d#ej1j2Z:G d$d% d%ej1j2Z;eG d&d' d'ej1j2Z<G d(d) d)eZ=d*Z>d+Z?e"d,e>G d-d. d.e=Z@G d/d0 d0ej1j2ZAG d1d2 d2ej1j2ZBG d3d4 d4ej1j2ZCe"d5e>G d6d7 d7e=ZDe"d8e>G d9d: d:e=eZEe"d;e>G d<d= d=e=ZFg d>ZGdS )?zTensorFlow DeiT model.    )annotationsN)	dataclass)OptionalTupleUnion   )get_tf_activation)TFBaseModelOutputTFBaseModelOutputWithPoolingTFImageClassifierOutputTFMaskedImageModelingOutput)TFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
DeiTConfigr   z(facebook/deit-base-distilled-patch16-224)r      i   ztabby, tabby catc                   @  sN   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded	< dS )
-TFDeiTForImageClassificationWithTeacherOutputa  
    Output type of [`DeiTForImageClassificationWithTeacher`].

    Args:
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the cls_logits and distillation logits.
        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    NzOptional[tf.Tensor]logits
cls_logitsdistillation_logitszTuple[tf.Tensor] | Nonehidden_states
attentions)
__name__
__module____qualname____doc__r   __annotations__r    r!   r"   r#    r)   r)   X/var/www/auris/lib/python3.10/site-packages/transformers/models/deit/modeling_tf_deit.pyr   C   s   
 r   c                      sJ   e Zd ZdZdd fd	d
ZdddZdddZ			d d!ddZ  ZS )"TFDeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfigr   use_mask_tokenboolreturnNonec                   sD   t  jdi | || _|| _t|dd| _tjj|j	dd| _
d S )Npatch_embeddings)r,   namedropoutr2   r)   )super__init__r,   r-   TFDeiTPatchEmbeddingsr1   r   layersDropouthidden_dropout_probr3   )selfr,   r-   kwargs	__class__r)   r*   r6   g   s
   zTFDeiTEmbeddings.__init__Nc                 C  sV  | j dd| jjftj ddd| _| j dd| jjftj ddd| _d | _| j	r<| j dd| jjftj ddd| _| j
j}| j d|d | jjftj ddd| _| jrYd S d| _t| d	d d urt| j
j | j
d  W d    n1 s|w   Y  t| d
d d urt| jj | jd  W d    d S 1 sw   Y  d S d S )Nr   T	cls_token)shapeZinitializerZ	trainabler2   distillation_token
mask_token   position_embeddingsr1   r3   )Z
add_weightr,   hidden_sizer   ZinitializersZzerosr?   rA   rB   r-   r1   num_patchesrD   builtgetattrtf
name_scoper2   buildr3   )r;   input_shaperF   r)   r)   r*   rK   n   sN   "zTFDeiTEmbeddings.build
embeddings	tf.Tensorheightintwidthc              	   C  sF  |j d d }| jj d d }||kr||kr| jS | jd d dd d f }| jd d dd d f }| jd d dd d d f }|j d }	|| jj }
|| jj }|
d |d }
}t|dtt|tt||	f}tj	j
|t|
t|fdd}tj|g dd	}t|dd|	f}tjtj|dd
tj|dd
|gdd
S )Nr   rC   r   g?Zbicubic)sizemethodr   rC   r   r   permaxis)r@   rD   r,   
patch_sizerI   reshaperP   mathsqrtimageresize	transposeconcatexpand_dims)r;   rM   rO   rQ   rF   Znum_positionsZclass_pos_embedZdist_pos_embedZpatch_pos_embeddimZh0Zw0r)   r)   r*   interpolate_pos_encoding   s(   
 z)TFDeiTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_postf.Tensor | Nonetrainingrd   c                 C  s   |j \}}}}| |}t|\}	}
}|d ur:t| j|	|
dg}tj|dd}tj||jd}|d|  ||  }tj	| j
|	dd}tj	| j|	dd}tj|||fdd}| j}|rb| |||}|| }| j||d}|S )	Nr   rR   rX   dtypeg      ?r   )ZrepeatsrY   rh   )r@   r1   r   rI   tilerB   rb   castrj   repeatr?   rA   ra   rD   rd   r3   )r;   re   rf   rh   rd   _rO   rQ   rM   
batch_sizeZ
seq_lengthZmask_tokensmaskZ
cls_tokensZdistillation_tokensZposition_embeddingr)   r)   r*   call   s"   
zTFDeiTEmbeddings.callF)r,   r   r-   r.   r/   r0   N)rM   rN   rO   rP   rQ   rP   r/   rN   )NFF)
re   rN   rf   rg   rh   r.   rd   r.   r/   rN   )	r$   r%   r&   r'   r6   rK   rd   rr   __classcell__r)   r)   r=   r*   r+   b   s    

'r+   c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )r7   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r,   r   r/   r0   c                   s   t  jdi | |j|j}}|j|j}}t|tjj	r |n||f}t|tjj	r-|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tjj|||dd| _d S )Nr   r   
projection)kernel_sizestridesr2   r)   )r5   r6   
image_sizerZ   num_channelsrE   
isinstancecollectionsabcIterablerF   r   r8   Conv2Drv   )r;   r,   r<   ry   rZ   rz   rE   rF   r=   r)   r*   r6      s    zTFDeiTPatchEmbeddings.__init__re   rN   c                 C  s^   t |\}}}}t r|| jkrtd| |}t |\}}}}t|||| |f}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rI   Zexecuting_eagerlyrz   
ValueErrorrv   r[   )r;   re   rp   rO   rQ   rz   xr)   r)   r*   rr      s   
zTFDeiTPatchEmbeddings.callNc                 C  sn   | j rd S d| _ t| dd d ur5t| jj | jd d d | jg W d    d S 1 s.w   Y  d S d S )NTrv   )rG   rH   rI   rJ   rv   r2   rK   rz   r;   rL   r)   r)   r*   rK         "zTFDeiTPatchEmbeddings.buildr,   r   r/   r0   )re   rN   r/   rN   rt   r$   r%   r&   r'   r6   rr   rK   ru   r)   r)   r=   r*   r7      s
    
r7   c                      s@   e Zd Zd fddZdd
dZ	ddddZdddZ  ZS )TFDeiTSelfAttentionr,   r   c                   s   t  jd
i | |j|j dkrtd|j d|j d|j| _t|j|j | _| j| j | _t	| j| _
tjj| jt|jdd| _tjj| jt|jdd| _tjj| jt|jdd| _tjj|jd	| _|| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()queryunitskernel_initializerr2   keyvalueZrater)   )r5   r6   rE   num_attention_headsr   rP   attention_head_sizeall_head_sizer\   r]   sqrt_att_head_sizer   r8   Denser   initializer_ranger   r   r   r9   Zattention_probs_dropout_probr3   r,   r;   r,   r<   r=   r)   r*   r6      s,   

zTFDeiTSelfAttention.__init__tensorrN   rp   rP   r/   c                 C  s,   t j||d| j| jfd}t j|g ddS )NrR   r   r@   r   rC   r   r   rV   )rI   r[   r   r   r`   )r;   r   rp   r)   r)   r*   transpose_for_scores  s   z(TFDeiTSelfAttention.transpose_for_scoresFr"   	head_maskoutput_attentionsr.   rh   Tuple[tf.Tensor]c                 C  s   t |d }| j|d}| j|d}| j|d}| ||}	| ||}
| ||}tj|	|
dd}tj| j|j	d}t
||}t|dd}| j||d}|d urXt||}t||}tj|g d	d
}tj||d| jfd}|rz||f}|S |f}|S )Nr   inputsT)Ztranspose_bri   rR   )r   rY   r   rh   r   rV   r   )r   r   r   r   r   rI   matmulrm   r   rj   divider   r3   multiplyr`   r[   r   )r;   r"   r   r   rh   rp   Zmixed_query_layerZmixed_key_layerZmixed_value_layerZquery_layerZ	key_layerZvalue_layerZattention_scoresZdkZattention_probsattention_outputoutputsr)   r)   r*   rr     s*   zTFDeiTSelfAttention.callNc                 C  s  | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d ur\t| j	j | j	d d | jjg W d    n1 sWw   Y  t| dd d urt| j
j | j
d d | jjg W d    d S 1 sw   Y  d S d S )NTr   r   r   )rG   rH   rI   rJ   r   r2   rK   r,   rE   r   r   r   r)   r)   r*   rK   H  s    "zTFDeiTSelfAttention.buildr,   r   )r   rN   rp   rP   r/   rN   rs   
r"   rN   r   rN   r   r.   rh   r.   r/   r   rt   )r$   r%   r&   r6   r   rr   rK   ru   r)   r)   r=   r*   r      s    
)r   c                      s8   e Zd ZdZd fddZddddZdddZ  ZS )TFDeiTSelfOutputz
    The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r,   r   c                   J   t  jdi | tjj|jt|jdd| _tjj	|j
d| _|| _d S Ndenser   r   r)   r5   r6   r   r8   r   rE   r   r   r   r9   r:   r3   r,   r   r=   r)   r*   r6   ^     
zTFDeiTSelfOutput.__init__Fr"   rN   input_tensorrh   r.   r/   c                 C  s   | j |d}| j||d}|S Nr   r   r   r3   r;   r"   r   rh   r)   r)   r*   rr   g  s   zTFDeiTSelfOutput.callNc                 C  n   | j rd S d| _ t| dd d ur5t| jj | jd d | jjg W d    d S 1 s.w   Y  d S d S NTr   	rG   rH   rI   rJ   r   r2   rK   r,   rE   r   r)   r)   r*   rK   m  r   zTFDeiTSelfOutput.buildr   rs   r"   rN   r   rN   rh   r.   r/   rN   rt   r   r)   r)   r=   r*   r   X  s
    	r   c                      s>   e Zd Zd fddZdd Z	ddddZdddZ  ZS )TFDeiTAttentionr,   r   c                   s2   t  jdi | t|dd| _t|dd| _d S )N	attentionr4   outputr)   )r5   r6   r   self_attentionr   dense_outputr   r=   r)   r*   r6   x  s   zTFDeiTAttention.__init__c                 C     t rt   NotImplementedError)r;   Zheadsr)   r)   r*   prune_heads~  s   zTFDeiTAttention.prune_headsFr   rN   r   r   r.   rh   r/   r   c                 C  s<   | j ||||d}| j|d ||d}|f|dd   }|S )Nr"   r   r   rh   r   r"   r   rh   r   )r   r   )r;   r   r   r   rh   Zself_outputsr   r   r)   r)   r*   rr     s   
zTFDeiTAttention.callNc                 C     | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urUt| jj | jd  W d    d S 1 sNw   Y  d S d S )NTr   r   )rG   rH   rI   rJ   r   r2   rK   r   r   r)   r)   r*   rK        "zTFDeiTAttention.buildr   rs   )
r   rN   r   rN   r   r.   rh   r.   r/   r   rt   )r$   r%   r&   r6   r   rr   rK   ru   r)   r)   r=   r*   r   w  s    r   c                      2   e Zd Zd fddZddd	ZdddZ  ZS )TFDeiTIntermediater,   r   c                   sZ   t  jdi | tjj|jt|jdd| _t	|j
tr$t|j
| _n|j
| _|| _d S )Nr   r   r)   )r5   r6   r   r8   r   intermediate_sizer   r   r   r{   Z
hidden_actstrr   intermediate_act_fnr,   r   r=   r)   r*   r6     s   
zTFDeiTIntermediate.__init__r"   rN   r/   c                 C  s   | j |d}| |}|S )Nr   )r   r   )r;   r"   r)   r)   r*   rr     s   
zTFDeiTIntermediate.callNc                 C  r   r   r   r   r)   r)   r*   rK     r   zTFDeiTIntermediate.buildr   r"   rN   r/   rN   rt   r$   r%   r&   r6   rr   rK   ru   r)   r)   r=   r*   r     s    
r   c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFDeiTOutputr,   r   c                   r   r   r   r   r=   r)   r*   r6     r   zTFDeiTOutput.__init__Fr"   rN   r   rh   r.   r/   c                 C  s&   | j |d}| j||d}|| }|S r   r   r   r)   r)   r*   rr     s   zTFDeiTOutput.callNc                 C  r   r   )	rG   rH   rI   rJ   r   r2   rK   r,   r   r   r)   r)   r*   rK     r   zTFDeiTOutput.buildr   rs   r   rt   r   r)   r)   r=   r*   r     s    	r   c                      s:   e Zd ZdZd fddZ	ddddZdddZ  ZS )TFDeiTLayerz?This corresponds to the Block class in the timm implementation.r,   r   c                   sn   t  jdi | t|dd| _t|dd| _t|dd| _tj	j
|jdd| _tj	j
|jdd| _|| _d S )	Nr   r4   intermediater   layernorm_beforeepsilonr2   layernorm_afterr)   )r5   r6   r   r   r   r   r   deit_outputr   r8   LayerNormalizationlayer_norm_epsr   r   r,   r   r=   r)   r*   r6     s   
zTFDeiTLayer.__init__Fr"   rN   r   r   r.   rh   r/   r   c           
      C  sn   | j | j||d|||d}|d }|| }| j||d}| j||d}| j|||d}|f|dd   }	|	S )Nr   )r   r   r   rh   r   )r"   rh   r   r   )r   r   r   r   r   )
r;   r"   r   r   rh   Zattention_outputsr   Zlayer_outputZintermediate_outputr   r)   r)   r*   rr     s   zTFDeiTLayer.callNc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d urwt| jj | jd  W d    n1 srw   Y  t| dd d urt| j	j | j	d d | j
jg W d    n1 sw   Y  t| dd d urt| jj | jd d | j
jg W d    d S 1 sw   Y  d S d S )NTr   r   r   r   r   )rG   rH   rI   rJ   r   r2   rK   r   r   r   r,   rE   r   r   r)   r)   r*   rK     s0   "zTFDeiTLayer.buildr   rs   r   rt   r   r)   r)   r=   r*   r     s     r   c                      s6   e Zd Zd fddZ	ddddZdddZ  ZS )TFDeiTEncoderr,   r   c                   s0   t  jdi |  fddt jD | _d S )Nc                   s   g | ]}t  d | dqS )zlayer_._r4   )r   ).0ir,   r)   r*   
<listcomp>  s    z*TFDeiTEncoder.__init__.<locals>.<listcomp>r)   )r5   r6   rangenum_hidden_layerslayerr   r=   r   r*   r6     s   zTFDeiTEncoder.__init__Fr"   rN   r   r   r.   output_hidden_statesreturn_dictrh   r/   *Union[TFBaseModelOutput, Tuple[tf.Tensor]]c                 C  s   |rdnd }|r
dnd }t | jD ]"\}	}
|r||f }|
|||	 ||d}|d }|r3||d f }q|r;||f }|sItdd |||fD S t|||dS )Nr)   r   r   r   c                 s  s    | ]	}|d ur|V  qd S rt   r)   )r   vr)   r)   r*   	<genexpr>A  s    z%TFDeiTEncoder.call.<locals>.<genexpr>)last_hidden_stater"   r#   )	enumerater   tupler	   )r;   r"   r   r   r   r   rh   Zall_hidden_statesZall_attentionsr   Zlayer_moduleZlayer_outputsr)   r)   r*   rr   !  s,   	

zTFDeiTEncoder.callNc              	   C  sj   | j rd S d| _ t| dd d ur1| jD ]}t|j |d  W d    n1 s+w   Y  qd S d S )NTr   )rG   rH   r   rI   rJ   r2   rK   )r;   rL   r   r)   r)   r*   rK   G  s   
zTFDeiTEncoder.buildr   rs   )r"   rN   r   rN   r   r.   r   r.   r   r.   rh   r.   r/   r   rt   r   r)   r)   r=   r*   r     s
    &r   c                      sj   e Zd ZeZ	d#d$ fd
dZd%ddZdd Zdd Ze									d&d'dd Z
d(d!d"Z  ZS ))TFDeiTMainLayerTFr,   r   add_pooling_layerr.   r-   r/   r0   c                   sj   t  jdi | || _t||dd| _t|dd| _tjj	|j
dd| _|r0t|dd| _d S d | _d S )	NrM   )r-   r2   encoderr4   	layernormr   poolerr)   )r5   r6   r,   r+   rM   r   r   r   r8   r   r   r   TFDeiTPoolerr   r;   r,   r   r-   r<   r=   r)   r*   r6   U  s    zTFDeiTMainLayer.__init__r7   c                 C  s   | j jS rt   )rM   r1   )r;   r)   r)   r*   get_input_embeddingsa  s   z$TFDeiTMainLayer.get_input_embeddingsc                 C  r   )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        r   )r;   Zheads_to_pruner)   r)   r*   _prune_headsd  s   zTFDeiTMainLayer._prune_headsc                 C  s   |d urt d g| jj }|S rt   )r   r,   r   )r;   r   r)   r)   r*   get_head_maskk  s   zTFDeiTMainLayer.get_head_maskNre   rg   rf   r   r   Optional[bool]r   r   rd   rh   :Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor, ...]]c	                 C  s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&tdt|d}| |}| j||||d}	| j	|	|||||d}
|
d }| j
||d}| jd ur\| j||dnd }|ss|d urh||fn|f}||
dd   S t|||
j|
jdS )	Nz You have to specify pixel_valuesrU   )rf   rh   rd   )r   r   r   r   rh   r   rk   r   )r   Zpooler_outputr"   r#   )r,   r   r   use_return_dictr   rI   r`   r   rM   r   r   r   r
   r"   r#   )r;   re   rf   r   r   r   r   rd   rh   Zembedding_outputZencoder_outputssequence_outputpooled_outputZhead_outputsr)   r)   r*   rr   s  sD   
zTFDeiTMainLayer.callc                 C  sL  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d ur|t| jj | jd d | j	j
g W d    n1 sww   Y  t| dd d urt| jj | jd  W d    d S 1 sw   Y  d S d S )NTrM   r   r   r   )rG   rH   rI   rJ   rM   r2   rK   r   r   r,   rE   r   r   r)   r)   r*   rK     s(   "zTFDeiTMainLayer.buildTFr,   r   r   r.   r-   r.   r/   r0   )r/   r7   NNNNNNFF)re   rg   rf   rg   r   rg   r   r   r   r   r   r   rd   r.   rh   r.   r/   r   rt   )r$   r%   r&   r   config_classr6   r   r   r   r   rr   rK   ru   r)   r)   r=   r*   r   Q  s$    
=r   c                   @  s   e Zd ZdZeZdZdZdS )TFDeiTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    deitre   N)r$   r%   r&   r'   r   r   Zbase_model_prefixZmain_input_namer)   r)   r)   r*   r     s
    r   aR  
    This model is a TensorFlow
    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
    TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.

    Parameters:
        config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DeiTImageProcessor.__call__`] for details.

        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.c                	      sf   e Zd Z	dd fd
dZeeeeee	e
ded								d d!ddZd"ddZ  ZS )#TFDeiTModelTFr,   r   r   r.   r-   r/   r0   c                   s*   t  j|fi | t|||dd| _d S )Nr   r   r-   r2   )r5   r6   r   r   r   r=   r)   r*   r6     s   zTFDeiTModel.__init__Zvision)
checkpointoutput_typer   Zmodalityexpected_outputNre   rg   rf   r   r   r   r   r   rd   rh   *Union[Tuple, TFBaseModelOutputWithPooling]c	           
   
   C  s   | j ||||||||d}	|	S )N)re   rf   r   r   r   r   rd   rh   )r   )
r;   re   rf   r   r   r   r   rd   rh   r   r)   r)   r*   rr     s   
zTFDeiTModel.callc                 C  sd   | j rd S d| _ t| dd d ur0t| jj | jd  W d    d S 1 s)w   Y  d S d S )NTr   )rG   rH   rI   rJ   r   r2   rK   r   r)   r)   r*   rK   !  s   "zTFDeiTModel.buildr   r   r   )re   rg   rf   rg   r   rg   r   r   r   r   r   r   rd   r.   rh   r.   r/   r   rt   )r$   r%   r&   r6   r   r   DEIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr
   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPErr   rK   ru   r)   r)   r=   r*   r     s,    		r   c                      r   )r   r,   r   c                   s<   t  jdi | tjj|jt|j|jdd| _	|| _
d S )Nr   )r   r   Z
activationr2   r)   )r5   r6   r   r8   r   Zpooler_output_sizer   r   Z
pooler_actr   r,   r   r=   r)   r*   r6   ,  s   
zTFDeiTPooler.__init__r"   rN   r/   c                 C  s    |d d df }| j |d}|S )Nr   r   )r   )r;   r"   Zfirst_token_tensorr   r)   r)   r*   rr   7  s   zTFDeiTPooler.callNc                 C  r   r   r   r   r)   r)   r*   rK   ?  r   zTFDeiTPooler.buildr   r   rt   r   r)   r)   r=   r*   r   +  s    
r   c                      s,   e Zd ZdZd fddZdd
dZ  ZS )TFDeitPixelShufflez0TF layer implementation of torch.nn.PixelShuffleupscale_factorrP   r/   r0   c                   s<   t  jdi | t|tr|dk rtd| || _d S )NrC   z1upscale_factor must be an integer value >= 2 got r)   )r5   r6   r{   rP   r   r  )r;   r  r<   r=   r)   r*   r6   K  s   
zTFDeitPixelShuffle.__init__r   rN   c                   s~   |}t |\}}}}| jd  t|  t fddt D g}tj|t||dgdd}tjj	|| jdd}|S )	NrC   c                   s&   g | ]}t D ]}||   qqS r)   )r   )r   r   jZblock_size_squaredZoutput_depthr)   r*   r   [  s   & z+TFDeitPixelShuffle.call.<locals>.<listcomp>r   rR   )paramsindicesZ
batch_dimsZNHWC)
block_sizeZdata_format)
r   r  rP   rI   Zconstantr   gatherrl   nnZdepth_to_space)r;   r   r"   rp   ro   Znum_input_channelsZpermutationr)   r  r*   rr   Q  s   
zTFDeitPixelShuffle.call)r  rP   r/   r0   )r   rN   r/   rN   )r$   r%   r&   r'   r6   rr   ru   r)   r)   r=   r*   r  H  s    r  c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFDeitDecoderr,   r   r/   r0   c                   sL   t  jdi | tjj|jd |j ddd| _t|jdd| _	|| _
d S )NrC   r   0)filtersrw   r2   1r4   r)   )r5   r6   r   r8   r   Zencoder_striderz   conv2dr  pixel_shuffler,   r   r=   r)   r*   r6   c  s   
zTFDeitDecoder.__init__Fr   rN   rh   r.   c                 C  s   |}|  |}| |}|S rt   )r  r  )r;   r   rh   r"   r)   r)   r*   rr   k  s   

zTFDeitDecoder.callNc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d | jjg W d    n1 s.w   Y  t| dd d ur[t| j	j | j	d  W d    d S 1 sTw   Y  d S d S )NTr  r  )
rG   rH   rI   rJ   r  r2   rK   r,   rE   r  r   r)   r)   r*   rK   q  s   "zTFDeitDecoder.buildr   rs   )r   rN   rh   r.   r/   rN   rt   r   r)   r)   r=   r*   r  b  s    r  zvDeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).c                      s\   e Zd Zd fddZeeeeee	d										ddddZ
dddZ  ZS )TFDeiTForMaskedImageModelingr,   r   r/   r0   c                   s0   t  | t|dddd| _t|dd| _d S )NFTr   r   decoderr4   )r5   r6   r   r   r  r  r;   r,   r=   r)   r*   r6     s   z%TFDeiTForMaskedImageModeling.__init__r   r   NFre   rg   rf   r   r   r   r   r   rd   r.   rh   )Union[tuple, TFMaskedImageModelingOutput]c	              
   C  s  |dur|n| j j}| j||||||||d}	|	d }
|
ddddf }
t|
\}}}t|d  }}t|
||||f}
| j|
|d}t|d}d}|dur| j j	| j j
 }t|d||f}t|| j j
d}t|| j j
d	}t|d}t|tj}tjt|d
t|d
}t|d}t|| }t|d | j j }|| }t|d}|s|f|	dd  }|dur|f| S |S t|||	j|	jdS )a  
        bool_masked_pos (`tf.Tensor` of type bool and shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, TFDeiTForMaskedImageModeling
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = TFDeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = tf.cast(tf.random.uniform((1, num_patches), minval=0, maxval=2, dtype=tf.int32), tf.bool)

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)rf   r   r   r   r   rd   rh   r   r   rR   g      ?rk   )r   r   r   rC   rC   )r   rC   r   r   gh㈵>)r   )lossZreconstructionr"   r#   )r,   r   r   r   rP   rI   r[   r  r`   ry   rZ   rn   rb   rm   Zfloat32r   ZlossesZmean_absolute_errorZ
reduce_sumrz   r   r"   r#   )r;   re   rf   r   r   r   r   rd   rh   r   r   rp   Zsequence_lengthrz   rO   rQ   Zreconstructed_pixel_valuesZmasked_im_lossrS   rq   Zreconstruction_lossZ
total_lossZnum_masked_pixelsr   r)   r)   r*   rr     sX   +

z!TFDeiTForMaskedImageModeling.callc                 C  r   )NTr   r  )rG   rH   rI   rJ   r   r2   rK   r  r   r)   r)   r*   rK     r   z"TFDeiTForMaskedImageModeling.buildr   r   )re   rg   rf   rg   r   rg   r   r   r   r   r   r   rd   r.   rh   r.   r/   r  rt   )r$   r%   r&   r6   r   r   r   r   r   r   rr   rK   ru   r)   r)   r=   r*   r  }  s    
cr  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                      s\   e Zd Zd fddZeeeeee	d								ddddZ
dddZ  ZS )TFDeiTForImageClassificationr,   r   c                   s\   t  | |j| _t|ddd| _|jdkr tjj|jddntjjddd| _	|| _
d S )NFr   r   r2   r   
classifierr4   linear)r5   r6   
num_labelsr   r   r   r8   r   
Activationr  r,   r  r=   r)   r*   r6     s   

z%TFDeiTForImageClassification.__init__r  NFre   rg   r   labelsr   r   r   r   rd   r.   rh   r/   )Union[tf.Tensor, TFImageClassifierOutput]c	              	   C  s   |dur|n| j j}| j|||||||d}	|	d }
| |
dddddf }|du r.dn| ||}|sJ|f|	dd  }|durH|f| S |S t|||	j|	jdS )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFDeiTForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> keras.utils.set_random_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a TFDeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = TFDeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        Predicted class: little blue heron, Egretta caerulea
        ```Nr   r   r   r   rd   rh   r   r   )r  r   r"   r#   )r,   r   r   r  Zhf_compute_lossr   r"   r#   )r;   re   r   r  r   r   r   rd   rh   r   r   r   r  r   r)   r)   r*   rr     s,   /
z!TFDeiTForImageClassification.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urZt| jj | jd d | jj	g W d    d S 1 sSw   Y  d S d S )NTr   r  )
rG   rH   rI   rJ   r   r2   rK   r  r,   rE   r   r)   r)   r*   rK   ^  s   "z"TFDeiTForImageClassification.buildr   r   )re   rg   r   rg   r  rg   r   r   r   r   r   r   rd   r.   rh   r.   r/   r  rt   )r$   r%   r&   r6   r   r   r   r   r   r   rr   rK   ru   r)   r)   r=   r*   r    s    
Jr  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
    c                      s^   e Zd Zd fddZeeeeee	e
ed									ddddZdddZ  ZS )'TFDeiTForImageClassificationWithTeacherr,   r   r/   r0   c                   s   t  | |j| _t|ddd| _|jdkr tjj|jddntjjddd| _	|jdkr7tjj|jddntjjddd| _
|| _d S )	NFr   r  r   cls_classifierr4   r  distillation_classifier)r5   r6   r  r   r   r   r8   r   r  r!  r"  r,   r  r=   r)   r*   r6   w  s   


z0TFDeiTForImageClassificationWithTeacher.__init__)r   r   r   r   NFre   rg   r   r   r   r   r   rd   r.   rh   ;Union[tuple, TFDeiTForImageClassificationWithTeacherOutput]c              	   C  s   |d ur|n| j j}| j|||||||d}|d }	| |	d d dd d f }
| |	d d dd d f }|
| d }|sK||
|f|dd   }|S t||
||j|jdS )Nr  r   r   rC   )r   r    r!   r"   r#   )r,   r   r   r!  r"  r   r"   r#   )r;   re   r   r   r   r   rd   rh   r   r   r    r!   r   r   r)   r)   r*   rr     s0   
z,TFDeiTForImageClassificationWithTeacher.callc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urWt| jj | jd d | jj	g W d    n1 sRw   Y  t| dd d urt| j
j | j
d d | jj	g W d    d S 1 s}w   Y  d S d S )NTr   r!  r"  )rG   rH   rI   rJ   r   r2   rK   r!  r,   rE   r"  r   r)   r)   r*   rK     s    "z-TFDeiTForImageClassificationWithTeacher.buildr   )NNNNNFF)re   rg   r   rg   r   r   r   r   r   r   rd   r.   rh   r.   r/   r#  rt   )r$   r%   r&   r6   r   r   r   r   _IMAGE_CLASS_CHECKPOINTr   r   _IMAGE_CLASS_EXPECTED_OUTPUTrr   rK   ru   r)   r)   r=   r*   r   j  s&    *r   )r  r   r  r   r   )Hr'   
__future__r   collections.abcr|   r\   dataclassesr   typingr   r   r   Z
tensorflowrI   Zactivations_tfr   Zmodeling_tf_outputsr	   r
   r   r   Zmodeling_tf_utilsr   r   r   r   r   r   Ztf_utilsr   r   utilsr   r   r   r   r   r   Zconfiguration_deitr   Z
get_loggerr$   loggerr   r   r  r$  r%  r   r8   ZLayerr+   r7   r   r   r   r   r   r   r   r   r   ZDEIT_START_DOCSTRINGr   r   r   r  r  r  r  r   __all__r)   r)   r)   r*   <module>   sv     
m.[(D6r4yh	U