"""TF 2.0 Cvt model."""

from __future__ import annotations

import collections.abc
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import tensorflow as tf

from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
from ...modeling_tf_utils import (
    TFModelInputType,
    TFPreTrainedModel,
    TFSequenceClassificationLoss,
    get_initializer,
    keras,
    keras_serializable,
    unpack_inputs,
)
from ...tf_utils import shape_list, stable_softmax
from ...utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_cvt import CvtConfig


logger = logging.get_logger(__name__)

# General docstring
_CONFIG_FOR_DOC = "CvtConfig"


@dataclass
class TFBaseModelOutputWithCLSToken(ModelOutput):
    """
    Base class for model's outputs.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
            Classification token at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
    """

    last_hidden_state: Optional[tf.Tensor] = None
    cls_token_value: Optional[tf.Tensor] = None
    hidden_states: Tuple[tf.Tensor, ...] | None = None


class TFCvtDropPath(keras.layers.Layer):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    References:
        (1) github.com:rwightman/pytorch-image-models
    """

    def __init__(self, drop_prob: float, **kwargs):
        super().__init__(**kwargs)
        self.drop_prob = drop_prob

    def call(self, x: tf.Tensor, training=None):
        if self.drop_prob == 0.0 or not training:
            return x
        keep_prob = 1 - self.drop_prob
        shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
        random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
        random_tensor = tf.floor(random_tensor)
        return (x / keep_prob) * random_tensor


class TFCvtEmbeddings(keras.layers.Layer):
    """Construct the Convolutional Token Embeddings."""

    def __init__(
        self,
        config: CvtConfig,
        patch_size: int,
        num_channels: int,
        embed_dim: int,
        stride: int,
        padding: int,
        dropout_rate: float,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.convolution_embeddings = TFCvtConvEmbeddings(
            config,
            patch_size=patch_size,
            num_channels=num_channels,
            embed_dim=embed_dim,
            stride=stride,
            padding=padding,
            name="convolution_embeddings",
        )
        self.dropout = keras.layers.Dropout(dropout_rate)

    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.convolution_embeddings(pixel_values)
        hidden_state = self.dropout(hidden_state, training=training)
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "convolution_embeddings", None) is not None:
            with tf.name_scope(self.convolution_embeddings.name):
                self.convolution_embeddings.build(None)


class TFCvtConvEmbeddings(keras.layers.Layer):
    """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""

    def __init__(
        self,
        config: CvtConfig,
        patch_size: int,
        num_channels: int,
        embed_dim: int,
        stride: int,
        padding: int,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.padding = keras.layers.ZeroPadding2D(padding=padding)
        self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        self.projection = keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=stride,
            padding="valid",
            data_format="channels_last",
            kernel_initializer=get_initializer(config.initializer_range),
            name="projection",
        )
        # Using the same default epsilon as PyTorch
        self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
        self.num_channels = num_channels
        self.embed_dim = embed_dim

    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
        if isinstance(pixel_values, dict):
            pixel_values = pixel_values["pixel_values"]

        pixel_values = self.projection(self.padding(pixel_values))

        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
        batch_size, height, width, num_channels = shape_list(pixel_values)
        hidden_size = height * width
        pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
        pixel_values = self.normalization(pixel_values)

        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
        pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
        return pixel_values

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "projection", None) is not None:
            with tf.name_scope(self.projection.name):
                self.projection.build([None, None, None, self.num_channels])
        if getattr(self, "normalization", None) is not None:
            with tf.name_scope(self.normalization.name):
                self.normalization.build([None, None, self.embed_dim])


class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
    """Convolutional projection layer."""

    def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
        super().__init__(**kwargs)
        self.padding = keras.layers.ZeroPadding2D(padding=padding)
        self.convolution = keras.layers.Conv2D(
            filters=embed_dim,
            kernel_size=kernel_size,
            kernel_initializer=get_initializer(config.initializer_range),
            padding="valid",
            strides=stride,
            use_bias=False,
            name="convolution",
            groups=embed_dim,
        )
        # Using the same default epsilon and momentum as PyTorch
        self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
        self.embed_dim = embed_dim

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.convolution(self.padding(hidden_state))
        hidden_state = self.normalization(hidden_state, training=training)
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "convolution", None) is not None:
            with tf.name_scope(self.convolution.name):
                self.convolution.build([None, None, None, self.embed_dim])
        if getattr(self, "normalization", None) is not None:
            with tf.name_scope(self.normalization.name):
                self.normalization.build([None, None, None, self.embed_dim])


class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
    """Linear projection layer used to flatten tokens into 1D."""

    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
        batch_size, height, width, num_channels = shape_list(hidden_state)
        hidden_size = height * width
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
        return hidden_state


class TFCvtSelfAttentionProjection(keras.layers.Layer):
    """Convolutional Projection for Attention."""

    def __init__(
        self,
        config: CvtConfig,
        embed_dim: int,
        kernel_size: int,
        stride: int,
        padding: int,
        projection_method: str = "dw_bn",
        **kwargs,
    ):
        super().__init__(**kwargs)
        if projection_method == "dw_bn":
            self.convolution_projection = TFCvtSelfAttentionConvProjection(
                config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
            )
        self.linear_projection = TFCvtSelfAttentionLinearProjection()

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.convolution_projection(hidden_state, training=training)
        hidden_state = self.linear_projection(hidden_state)
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "convolution_projection", None) is not None:
            with tf.name_scope(self.convolution_projection.name):
                self.convolution_projection.build(None)


class TFCvtSelfAttention(keras.layers.Layer):
    """
    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
    query, key, and value embeddings.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.scale = embed_dim**-0.5
        self.with_cls_token = with_cls_token
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.convolution_projection_query = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_q,
            padding_q,
            projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
            name="convolution_projection_query",
        )
        self.convolution_projection_key = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_kv,
            padding_kv,
            projection_method=qkv_projection_method,
            name="convolution_projection_key",
        )
        self.convolution_projection_value = TFCvtSelfAttentionProjection(
            config,
            embed_dim,
            kernel_size,
            stride_kv,
            padding_kv,
            projection_method=qkv_projection_method,
            name="convolution_projection_value",
        )

        self.projection_query = keras.layers.Dense(
            units=embed_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=qkv_bias,
            bias_initializer="zeros",
            name="projection_query",
        )
        self.projection_key = keras.layers.Dense(
            units=embed_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=qkv_bias,
            bias_initializer="zeros",
            name="projection_key",
        )
        self.projection_value = keras.layers.Dense(
            units=embed_dim,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=qkv_bias,
            bias_initializer="zeros",
            name="projection_value",
        )
        self.dropout = keras.layers.Dropout(attention_drop_rate)

    def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
        batch_size, hidden_size, _ = shape_list(hidden_state)
        head_dim = self.embed_dim // self.num_heads
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
        return hidden_state

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
        if self.with_cls_token:
            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)

        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
        batch_size, hidden_size, num_channels = shape_list(hidden_state)
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))

        key = self.convolution_projection_key(hidden_state, training=training)
        query = self.convolution_projection_query(hidden_state, training=training)
        value = self.convolution_projection_value(hidden_state, training=training)

        if self.with_cls_token:
            query = tf.concat((cls_token, query), axis=1)
            key = tf.concat((cls_token, key), axis=1)
            value = tf.concat((cls_token, value), axis=1)

        head_dim = self.embed_dim // self.num_heads

        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
        value = self.rearrange_for_multi_head_attention(self.projection_value(value))

        attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
        attention_probs = stable_softmax(logits=attention_score, axis=-1)
        attention_probs = self.dropout(attention_probs, training=training)

        context = tf.matmul(attention_probs, value)
        # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)"
        _, _, hidden_size, _ = shape_list(context)
        context = tf.transpose(context, perm=(0, 2, 1, 3))
        context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
        return context

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "convolution_projection_query", None) is not None:
            with tf.name_scope(self.convolution_projection_query.name):
                self.convolution_projection_query.build(None)
        if getattr(self, "convolution_projection_key", None) is not None:
            with tf.name_scope(self.convolution_projection_key.name):
                self.convolution_projection_key.build(None)
        if getattr(self, "convolution_projection_value", None) is not None:
            with tf.name_scope(self.convolution_projection_value.name):
                self.convolution_projection_value.build(None)
        if getattr(self, "projection_query", None) is not None:
            with tf.name_scope(self.projection_query.name):
                self.projection_query.build([None, None, self.embed_dim])
        if getattr(self, "projection_key", None) is not None:
            with tf.name_scope(self.projection_key.name):
                self.projection_key.build([None, None, self.embed_dim])
        if getattr(self, "projection_value", None) is not None:
            with tf.name_scope(self.projection_value.name):
                self.projection_value.build([None, None, self.embed_dim])


class TFCvtSelfOutput(keras.layers.Layer):
    """Output of the Attention layer."""

    def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.dropout = keras.layers.Dropout(drop_rate)
        self.embed_dim = embed_dim

    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.dense(inputs=hidden_state)
        hidden_state = self.dropout(inputs=hidden_state, training=training)
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.embed_dim])


class TFCvtAttention(keras.layers.Layer):
    """Attention layer. First chunk of the convolutional transformer block."""

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.attention = TFCvtSelfAttention(
            config,
            num_heads,
            embed_dim,
            kernel_size,
            stride_q,
            stride_kv,
            padding_q,
            padding_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            with_cls_token,
            name="attention",
        )
        self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")

    def prune_heads(self, heads):
        raise NotImplementedError

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
        self_output = self.attention(hidden_state, height, width, training=training)
        attention_output = self.dense_output(self_output, training=training)
        return attention_output

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "dense_output", None) is not None:
            with tf.name_scope(self.dense_output.name):
                self.dense_output.build(None)


class TFCvtIntermediate(keras.layers.Layer):
    """Intermediate dense layer. Second chunk of the convolutional transformer block."""

    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=int(embed_dim * mlp_ratio),
            kernel_initializer=get_initializer(config.initializer_range),
            activation="gelu",
            name="dense",
        )
        self.embed_dim = embed_dim

    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
        hidden_state = self.dense(hidden_state)
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, self.embed_dim])


class TFCvtOutput(keras.layers.Layer):
    """
    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
    """

    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(
            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
        )
        self.dropout = keras.layers.Dropout(drop_rate)
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

    def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
        hidden_state = self.dense(inputs=hidden_state)
        hidden_state = self.dropout(inputs=hidden_state, training=training)
        hidden_state = hidden_state + input_tensor
        return hidden_state

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "dense", None) is not None:
            with tf.name_scope(self.dense.name):
                self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])


class TFCvtLayer(keras.layers.Layer):
    """
    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
    `Block` class in the original implementation.
    """

    def __init__(
        self,
        config: CvtConfig,
        num_heads: int,
        embed_dim: int,
        kernel_size: int,
        stride_q: int,
        stride_kv: int,
        padding_q: int,
        padding_kv: int,
        qkv_projection_method: str,
        qkv_bias: bool,
        attention_drop_rate: float,
        drop_rate: float,
        mlp_ratio: float,
        drop_path_rate: float,
        with_cls_token: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.attention = TFCvtAttention(
            config,
            num_heads,
            embed_dim,
            kernel_size,
            stride_q,
            stride_kv,
            padding_q,
            padding_kv,
            qkv_projection_method,
            qkv_bias,
            attention_drop_rate,
            drop_rate,
            with_cls_token,
            name="attention",
        )
        self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
        self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
        # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
        self.drop_path = (
            TFCvtDropPath(drop_path_rate, name="drop_path")
            if drop_path_rate > 0.0
            else keras.layers.Activation("linear", name="drop_path")
        )
        # Using the same default epsilon as PyTorch
        self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
        self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
        self.embed_dim = embed_dim

    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
        # in Cvt, layernorm is applied before self-attention
        attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
        attention_output = self.drop_path(attention_output, training=training)

        # first residual connection
        hidden_state = attention_output + hidden_state

        # in Cvt, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_state)
        layer_output = self.intermediate(layer_output)

        # second residual connection is done here
        layer_output = self.dense_output(layer_output, hidden_state)
        layer_output = self.drop_path(layer_output, training=training)
        return layer_output

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "attention", None) is not None:
            with tf.name_scope(self.attention.name):
                self.attention.build(None)
        if getattr(self, "intermediate", None) is not None:
            with tf.name_scope(self.intermediate.name):
                self.intermediate.build(None)
        if getattr(self, "dense_output", None) is not None:
            with tf.name_scope(self.dense_output.name):
                self.dense_output.build(None)
        if getattr(self, "drop_path", None) is not None:
            with tf.name_scope(self.drop_path.name):
                self.drop_path.build(None)
        if getattr(self, "layernorm_before", None) is not None:
            with tf.name_scope(self.layernorm_before.name):
                self.layernorm_before.build([None, None, self.embed_dim])
        if getattr(self, "layernorm_after", None) is not None:
            with tf.name_scope(self.layernorm_after.name):
                self.layernorm_after.build([None, None, self.embed_dim])


class TFCvtStage(keras.layers.Layer):
    """
    Cvt stage (encoder block). Each stage has 2 parts :
    - (1) A Convolutional Token Embedding layer
    - (2) A Convolutional Transformer Block (layer).
    The classification token is added only in the last stage.

    Args:
        config ([`CvtConfig`]): Model configuration class.
        stage (`int`): Stage number.
    """

    def __init__(self, config: CvtConfig, stage: int, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.stage = stage
        if self.config.cls_token[self.stage]:
            self.cls_token = self.add_weight(
                shape=(1, 1, self.config.embed_dim[-1]),
                initializer=get_initializer(self.config.initializer_range),
                trainable=True,
                name="cvt.encoder.stages.2.cls_token",
            )

        self.embedding = TFCvtEmbeddings(
            self.config,
            patch_size=config.patch_sizes[self.stage],
            num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
            stride=config.patch_stride[self.stage],
            embed_dim=config.embed_dim[self.stage],
            padding=config.patch_padding[self.stage],
            dropout_rate=config.drop_rate[self.stage],
            name="embedding",
        )

        drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
        drop_path_rates = [x.numpy().item() for x in drop_path_rates]
        self.layers = [
            TFCvtLayer(
                config,
                num_heads=config.num_heads[self.stage],
                embed_dim=config.embed_dim[self.stage],
                kernel_size=config.kernel_qkv[self.stage],
                stride_q=config.stride_q[self.stage],
                stride_kv=config.stride_kv[self.stage],
                padding_q=config.padding_q[self.stage],
                padding_kv=config.padding_kv[self.stage],
                qkv_projection_method=config.qkv_projection_method[self.stage],
                qkv_bias=config.qkv_bias[self.stage],
                attention_drop_rate=config.attention_drop_rate[self.stage],
                drop_rate=config.drop_rate[self.stage],
                mlp_ratio=config.mlp_ratio[self.stage],
                drop_path_rate=drop_path_rates[self.stage],
                with_cls_token=config.cls_token[self.stage],
                name=f"layers.{j}",
            )
            for j in range(config.depth[self.stage])
        ]

    def call(self, hidden_state: tf.Tensor, training: bool = False):
        cls_token = None
        hidden_state = self.embedding(hidden_state, training)

        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
        batch_size, height, width, num_channels = shape_list(hidden_state)
        hidden_size = height * width
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))

        if self.config.cls_token[self.stage]:
            cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
            hidden_state = tf.concat((cls_token, hidden_state), axis=1)

        for layer in self.layers:
            layer_outputs = layer(hidden_state, height, width, training=training)
            hidden_state = layer_outputs

        if self.config.cls_token[self.stage]:
            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)

        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
        return hidden_state, cls_token

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "embedding", None) is not None:
            with tf.name_scope(self.embedding.name):
                self.embedding.build(None)
        if getattr(self, "layers", None) is not None:
            for layer in self.layers:
                with tf.name_scope(layer.name):
                    layer.build(None)


class TFCvtEncoder(keras.layers.Layer):
    """
    Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
    (depth) being 1, 2 and 10.

    Args:
        config ([`CvtConfig`]): Model configuration class.
    """

    config_class = CvtConfig

    def __init__(self, config: CvtConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.stages = [
            TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
        ]

    def call(
        self,
        pixel_values: TFModelInputType,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        all_hidden_states = () if output_hidden_states else None
        hidden_state = pixel_values
        # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
        # as input format, so change the input format to (batch_size, height, width, num_channels).
        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))

        cls_token = None
        for _, (stage_module) in enumerate(self.stages):
            hidden_state, cls_token = stage_module(hidden_state, training=training)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_state,)

        # Change the hidden states back to (batch_size, num_channels, height, width) for uniformity with other models.
        hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
        if output_hidden_states:
            all_hidden_states = tuple([tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states])

        if not return_dict:
            return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)

        return TFBaseModelOutputWithCLSToken(
            last_hidden_state=hidden_state,
            cls_token_value=cls_token,
            hidden_states=all_hidden_states,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "stages", None) is not None:
            for layer in self.stages:
                with tf.name_scope(layer.name):
                    layer.build(None)


@keras_serializable
class TFCvtMainLayer(keras.layers.Layer):
    """Construct the Cvt model."""

    config_class = CvtConfig

    def __init__(self, config: CvtConfig, **kwargs):
        super().__init__(**kwargs)
        self.config = config
        self.encoder = TFCvtEncoder(config, name="encoder")

    @unpack_inputs
    def call(
        self,
        pixel_values: TFModelInputType | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = encoder_outputs[0]

        if not return_dict:
            return (sequence_output,) + encoder_outputs[1:]

        return TFBaseModelOutputWithCLSToken(
            last_hidden_state=sequence_output,
            cls_token_value=encoder_outputs.cls_token_value,
            hidden_states=encoder_outputs.hidden_states,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "encoder", None) is not None:
            with tf.name_scope(self.encoder.name):
                self.encoder.build(None)


class TFCvtPreTrainedModel(TFPreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = CvtConfig
    base_model_prefix = "cvt"
    main_input_name = "pixel_values"


TFCVT_START_DOCSTRING = r"""

    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TF 2.0 models accept two formats as inputs:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional arguments.

    This second option is useful when using the [`keras.Model.fit`] method, which currently requires having all the
    tensors in the first argument of the model call function: `model(inputs)`.

    </Tip>

    Args:
        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
"""

TFCVT_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
            for details.

        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
            used instead.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
            eager mode, in graph mode the value will always be set to True.
        training (`bool`, *optional*, defaults to `False`):
            Whether or not to use the model in training mode (some modules like dropout modules have different
            behaviors between training and evaluation).
"""


@add_start_docstrings(
    "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
    TFCVT_START_DOCSTRING,
)
class TFCvtModel(TFCvtPreTrainedModel):
    def __init__(self, config: CvtConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.cvt = TFCvtMainLayer(config, name="cvt")

    @unpack_inputs
    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFBaseModelOutputWithCLSToken, Tuple[tf.Tensor]]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        outputs = self.cvt(
            pixel_values=pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        if not return_dict:
            return (outputs[0],) + outputs[1:]

        return TFBaseModelOutputWithCLSToken(
            last_hidden_state=outputs.last_hidden_state,
            cls_token_value=outputs.cls_token_value,
            hidden_states=outputs.hidden_states,
        )

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "cvt", None) is not None:
            with tf.name_scope(self.cvt.name):
                self.cvt.build(None)


@add_start_docstrings(
    """
    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    """,
    TFCVT_START_DOCSTRING,
)
class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
    def __init__(self, config: CvtConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)

        self.num_labels = config.num_labels
        self.cvt = TFCvtMainLayer(config, name="cvt")
        # Using same default epsilon as in the original implementation.
        self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")

        # Classifier head
        self.classifier = keras.layers.Dense(
            units=config.num_labels,
            kernel_initializer=get_initializer(config.initializer_range),
            use_bias=True,
            bias_initializer="zeros",
            name="classifier",
        )
        self.config = config

    @unpack_inputs
    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
    def call(
        self,
        pixel_values: tf.Tensor | None = None,
        labels: tf.Tensor | None = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        training: Optional[bool] = False,
    ) -> Union[TFImageClassifierOutputWithNoAttention, Tuple[tf.Tensor]]:
        r"""
        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), if
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        ```"""

        outputs = self.cvt(
            pixel_values,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            training=training,
        )

        sequence_output = outputs[0]
        cls_token = outputs[1]
        if self.config.cls_token[-1]:
            sequence_output = self.layernorm(cls_token)
        else:
            # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels"
            batch_size, num_channels, height, width = shape_list(sequence_output)
            sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
            sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
            sequence_output = self.layernorm(sequence_output)

        sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
        logits = self.classifier(sequence_output_mean)
        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)

    def build(self, input_shape=None):
        if self.built:
            return
        self.built = True
        if getattr(self, "cvt", None) is not None:
            with tf.name_scope(self.cvt.name):
                self.cvt.build(None)
        if getattr(self, "layernorm", None) is not None:
            with tf.name_scope(self.layernorm.name):
                self.layernorm.build([None, None, self.config.embed_dim[-1]])
        if getattr(self, "classifier", None) is not None:
            if hasattr(self.classifier, "name"):
                with tf.name_scope(self.classifier.name):
                    self.classifier.build([None, None, self.config.embed_dim[-1]])


__all__ = ["TFCvtForImageClassification", "TFCvtModel", "TFCvtPreTrainedModel"]