o
    Zh7                    @  s*  d Z ddlmZ ddlZddlZddlZddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZmZmZ ddlZddlmZ dd	lmZmZmZmZmZmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) e&*e+Z,dZ-dZ.g dZ/dZ0dZ1eG dd de"Z2eG dd de"Z3eG dd de"Z4eG dd de"Z5dcdd Z6ddd$d%Z7	(dedfd/d0Z8G d1d2 d2ej9j:Z;G d3d4 d4ej9j:Z<G d5d6 d6ej9j:Z=G d7d8 d8ej9j:Z>G d9d: d:ej9j:Z?G d;d< d<ej9j:Z@G d=d> d>ej9j:ZAG d?d@ d@ej9j:ZBG dAdB dBej9j:ZCG dCdD dDej9j:ZDG dEdF dFej9j:ZEG dGdH dHej9j:ZFG dIdJ dJeZGdKZHdLZIdgdOdPZJG dQdR dRej9j:ZKeG dSdT dTej9j:ZLe$dUeHG dVdW dWeGZMG dXdY dYej9j:ZNG dZd[ d[ej9j:ZOe$d\eHG d]d^ d^eGZPe$d_eHG d`da daeGeZQg dbZRdS )hzTF 2.0 Swin Transformer model.    )annotationsN)	dataclass)partial)AnyCallableDictIterableListOptionalTupleUnion   )ACT2FN)TFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_list)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
SwinConfigr   z&microsoft/swin-tiny-patch4-window7-224)r   1   i   ztabby, tabby catc                   @  sB   e Zd ZU dZdZded< dZded< dZded< dZded< dS )	TFSwinEncoderOutputaH  
    Swin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    NOptional[tf.Tensor]last_hidden_stateTuple[tf.Tensor, ...] | Nonehidden_states
attentionsreshaped_hidden_states)	__name__
__module____qualname____doc__r!   __annotations__r#   r$   r%    r+   r+   X/var/www/auris/lib/python3.10/site-packages/transformers/models/swin/modeling_tf_swin.pyr   C   s   
 r   c                   @  N   e Zd ZU dZdZded< dZded< dZded< dZded	< dZ	ded
< dS )TFSwinModelOutputa  
    Swin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr    r!   tf.Tensor | Nonepooler_outputr"   r#   r$   r%   )
r&   r'   r(   r)   r!   r*   r0   r#   r$   r%   r+   r+   r+   r,   r.   d      
 r.   c                   @  sZ   e Zd ZU dZdZded< dZded< dZded< dZded	< dZ	ded
< e
dd ZdS )TFSwinMaskedImageModelingOutputa  
    Swin masked image model outputs.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr/   lossr    reconstructionr"   r#   r$   r%   c                 C  s   t dt | jS )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr4   selfr+   r+   r,   logits   s
   z&TFSwinMaskedImageModelingOutput.logits)r&   r'   r(   r)   r3   r*   r4   r#   r$   r%   propertyr:   r+   r+   r+   r,   r2      s   
 r2   c                   @  r-   )TFSwinImageClassifierOutputa  
    Swin outputs for image classification.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr/   r3   r    r:   r"   r#   r$   r%   )
r&   r'   r(   r)   r3   r*   r:   r#   r$   r%   r+   r+   r+   r,   r<      r1   r<   input_feature	tf.Tensorwindow_sizeintreturnc              	   C  sT   t | \}}}}t| ||| ||| ||f} t| d}t|d|||f}|S )z2
    Partitions the given input into windows.
    r   r   r            )r   tfreshape	transpose)r=   r?   
batch_sizeheightwidthnum_channelswindowsr+   r+   r,   window_partition   s   rO   rN   rK   rL   c              	   C  sz   t | d }t || ||  t j}t j||}t | ||| || ||df} t | d} t | |||df} | S )z?
    Merges windows to produce higher resolution features.
    r   rF   rB   )rG   shapecastint32mathfloordivrH   rI   )rN   r?   rK   rL   xyrJ   r+   r+   r,   window_reverse   s   rW           FTinput	drop_probfloattrainingboolscale_by_keepc           	      C  sz   |dks|s| S d| }t | }t|}|d gdg|d   }tj|}t||kdd}|dkr9|r9|| }| | S )zb
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    rX   r   r         ?)r   lenrG   randomuniformwhere)	rY   rZ   r\   r^   Z	keep_probinput_shapendimrP   Zrandom_tensorr+   r+   r,   	drop_path   s   rf   c                      s<   e Zd ZdZdd fd	d
ZdddZ	ddddZ  ZS )TFSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fconfigr   use_mask_tokenr]   rA   Nonec                   sz   t  jdi | t|dd| _| jj| _| jj| _|j| _|| _|j	| _	t
jjddd| _t
jj|jdd| _|| _d S )Npatch_embeddingsnamenormh㈵>)rm   epsilondropoutr+   )super__init__TFSwinPatchEmbeddingsrk   num_patches	grid_size
patch_grid	embed_dimri   use_absolute_embeddingsr   layersLayerNormalizationrn   Dropouthidden_dropout_probrq   rh   )r9   rh   ri   kwargs	__class__r+   r,   rs     s   


zTFSwinEmbeddings.__init__rd   tf.TensorShapec                 C  sX  | j r| jdd| jfddd| _nd | _| jr(| jd| jd | jfddd| _nd | _| jr0d S d| _t| dd d urXt	
| jj | jd  W d    n1 sSw   Y  t| d	d d urt	
| jj | jd d | jjg W d    n1 s}w   Y  t| d
d d urt	
| jj | jd  W d    d S 1 sw   Y  d S d S )Nr   zeros
mask_tokenrP   initializerrm   Zpositional_embeddings)r   rm   Trk   rn   rq   )ri   
add_weightrx   r   ry   ru   position_embeddingsbuiltgetattrrG   
name_scoperk   rm   buildrn   rh   rq   r9   rd   r+   r+   r,   r     s0   
"zTFSwinEmbeddings.buildNpixel_valuesr>   bool_masked_posOptional[bool]r\   !Tuple[tf.Tensor, Tuple[int, int]]c                 C  s   | j ||d\}}| j||d}t|\}}}|d urAt| j|d}	t|	|d}	t|d}
t|
|	j}
|d|
  |	|
  }| j	d urK|| j	 }| j
||d}||fS )Nr\   r   r   rF   r_   )rk   rn   r   rG   repeatr   expand_dimsrQ   dtyper   rq   )r9   r   r   r\   
embeddingsoutput_dimensionsrJ   Zseq_len_Zmask_tokensmaskr+   r+   r,   call5  s   

zTFSwinEmbeddings.callF)rh   r   ri   r]   rA   rj   rd   r   rA   rj   )NF)r   r>   r   r   r\   r]   rA   r   )r&   r'   r(   r)   rs   r   r   __classcell__r+   r+   r   r,   rg   	  s    
rg   c                      s@   e Zd ZdZ fddZdd
dZddddZdddZ  ZS )rt   z#
    Image to Patch Embedding.
    c                   s   t  jdi | |j|j}}|j|j}}t|tjj	r |n||f}t|tjj	r-|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tjj|| j| jddd| _d S )Nr   r   Zvalid
projection)filterskernel_sizestridespaddingrm   r+   )rr   rs   
image_size
patch_sizerM   rx   
isinstancecollectionsabcr   ru   rv   r   rz   Conv2Dr   )r9   rh   r~   r   r   rM   hidden_sizeru   r   r+   r,   rs   R  s$    "zTFSwinPatchEmbeddings.__init__r   r>   rK   r@   rL   rA   c                 C  s   || j d  dkr!dddd| j d || j d   ff}t||}|| j d  dkrBddd| j d || j d   fdf}t||}|S )Nr   r   r   r   )r   rG   pad)r9   r   rK   rL   
pad_valuesr+   r+   r,   	maybe_padg  s   $$zTFSwinPatchEmbeddings.maybe_padFr\   r]   r   c                 C  s   t |\}}}}t r|| jkrtd| |||}t|d}| j||d}t|d}t |\}}	}}||f}
t|||	df}t|d}||
fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   rC   r   r   r   r   r   r   rC   rF   r   rC   r   )	r   rG   Zexecuting_eagerlyrM   
ValueErrorr   rI   r   rH   )r9   r   r\   r   rM   rK   rL   r   rJ   channelsr   r+   r+   r,   r   p  s   zTFSwinPatchEmbeddings.callNc                 C  sn   | j rd S d| _ t| dd d ur5t| jj | jd d d | jg W d    d S 1 s.w   Y  d S d S )NTr   )r   r   rG   r   r   rm   r   rM   r   r+   r+   r,   r     s   "zTFSwinPatchEmbeddings.build)r   r>   rK   r@   rL   r@   rA   r>   r   )r   r>   r\   r]   rA   r   N	r&   r'   r(   r)   rs   r   r   r   r   r+   r+   r   r,   rt   M  s    
	rt   c                      sF   e Zd ZdZ	dd fddZdddZddddZdddZ  ZS ) TFSwinPatchMergingaB  
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`):
            Normalization layer class.
    Ninput_resolutionTuple[int, int]dimr@   
norm_layerOptional[Callable]rA   rj   c                   sd   t  jd	i | || _|| _tjjd| ddd| _|d u r*tjjddd| _	d S |dd| _	d S )
NrC   F	reduction)use_biasrm   ro   rn   rp   rm   rl   r+   )
rr   rs   r   r   r   rz   Denser   r{   rn   )r9   r   r   r   r~   r   r+   r,   rs     s   zTFSwinPatchMerging.__init__r=   r>   rK   rL   c                 C  sH   |d dkp|d dk}|r"dd|d fd|d fdf}t ||}|S )NrC   r   r   r   )rG   r   )r9   r=   rK   rL   Z
should_padr   r+   r+   r,   r     s
   zTFSwinPatchMerging.maybe_padFinput_dimensionsr\   r]   c                 C  s  |\}}t |\}}}t|||||f}| |||}|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }|d d dd ddd dd d f }t|	|
||gd}t||dd| f}| j||d}| j||d}|S )Nr   rC   r   rF   rD   r   )r   rG   rH   r   concatrn   r   )r9   r=   r   r\   rK   rL   rJ   r   rM   Zinput_feature_0Zinput_feature_1Zinput_feature_2Zinput_feature_3r+   r+   r,   r     s   $$$$zTFSwinPatchMerging.callc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d| j g W d    n1 s.w   Y  t| dd d urat| jj | jd d d| j g W d    d S 1 sZw   Y  d S d S )NTr   rD   rn   )	r   r   rG   r   r   rm   r   r   rn   r   r+   r+   r,   r     s   "zTFSwinPatchMerging.buildr   )r   r   r   r@   r   r   rA   rj   )r=   r>   rK   r@   rL   r@   rA   r>   r   )r=   r>   r   r   r\   r]   rA   r>   r   r+   r+   r   r,   r     s    
r   c                      s0   e Zd ZdZdd fd
dZddddZ  ZS )TFSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).NTrZ   Optional[float]r^   r]   rA   rj   c                   s&   t t| jdi | || _|| _d S Nr+   )rr   r   rs   rZ   r^   )r9   rZ   r^   r~   r   r+   r,   rs     s   
zTFSwinDropPath.__init__FrY   r>   r\   c                 C  s   t || j|| jS r   )rf   rZ   r^   )r9   rY   r\   r+   r+   r,   r     s   zTFSwinDropPath.call)NT)rZ   r   r^   r]   rA   rj   r   )rY   r>   r\   r]   rA   r>   r&   r'   r(   r)   rs   r   r   r+   r+   r   r,   r     s    r   c                      sF   e Zd Zd fdd	ZdddZd ddZ				d!d"ddZ  ZS )#TFSwinSelfAttentionrh   r   r   r@   	num_headsrA   rj   c                   s   t  jd	i | || dkrtd| d| d|| _t|| | _| j| j | _|j}t|t	j
jr7|n||f| _tjj| jt|j|jdd| _tjj| jt|j|jdd| _tjj| jt|j|jdd| _tj|j| _d S )
Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()query)Zkernel_initializerr   rm   keyvaluer+   )rr   rs   r   num_attention_headsr@   attention_head_sizeall_head_sizer?   r   r   r   r   r   rz   r   r   Zinitializer_rangeZqkv_biasr   r   r   r|   attention_probs_dropout_probrq   )r9   rh   r   r   r~   r?   r   r+   r,   rs     s<   zTFSwinSelfAttention.__init__rd   r   c           	      C  s  | j d| jd  d d| jd  d  | jfddd| _| j | jd d | jd d fdtjdd	| _t| jd }t| jd }ttj	||d
d}t
|t|d df}|d d d d d f |d d d d d f  }t|d}tj|dd\}}|| jd d 7 }|d| jd  d 9 }|| jd d 7 }tj||gdd}| jttj|ddtj | jrd S d| _t| dd d urt| jj | jd d | jg W d    n1 sw   Y  t| dd d urt| jj | jd d | jg W d    n	1 sw   Y  t| dd d ur?t| jj | jd d | jg W d    d S 1 s8w   Y  d S d S )NrC   r   r   r   relative_position_bias_tabler   Frelative_position_index)rP   Z	trainabler   rm   Zij)ZindexingrF   )r   rC   r   axisTr   r   r   )r   r?   r   r   rG   rR   r   rangestackmeshgridrH   r   rI   ZunstackZassignrQ   
reduce_sumr   r   r   r   rm   r   r   r   r   )	r9   rd   Zcoords_hZcoords_wZcoordsZcoords_flattenZrelative_coordsZstack_0Zstack_1r+   r+   r,   r     sN   (, $zTFSwinSelfAttention.buildrU   r>   c                 C  s4   t |d d | j| jg }t||}t|dS )NrF   r   rC   r   r   )r   r   r   rG   rH   rI   )r9   rU   Znew_x_shaper+   r+   r,   transpose_for_scores5  s   z(TFSwinSelfAttention.transpose_for_scoresNFr#   attention_maskr/   	head_maskoutput_attentionsr]   r\   Tuple[tf.Tensor, ...]c                 C  s  t |\}}}| |}	| | |}
| | |}| |	}t|t|
d}|t	| j
 }t| jt| jd}t|| jd | jd  | jd | jd  df}t|d}|t|d }|d urt |d }t||| || j||f}t|d}t|d}|| }t|d| j||f}tjj|dd}| j||d}|d ur|| }t||}t|d	}t |d d
 | jg }t||}|r||f}|S |f}|S )N)r   r   r   rC   rF   r   r   rF   )rC   r   r   r   r   r   )r   r   r   r   r   rG   matmulrI   rS   sqrtr   gatherr   rH   r   r?   r   r   nnZsoftmaxrq   r   )r9   r#   r   r   r   r\   rJ   r   r   Zmixed_query_layerZ	key_layerZvalue_layerZquery_layerZattention_scoresZrelative_position_biasZ
mask_shapeZattention_probsZcontext_layerZnew_context_layer_shapeoutputsr+   r+   r,   r   :  sN   

(zTFSwinSelfAttention.callrh   r   r   r@   r   r@   rA   rj   r   rU   r>   rA   r>   NNFF)r#   r>   r   r/   r   r/   r   r]   r\   r]   rA   r   )r&   r'   r(   rs   r   r   r   r   r+   r+   r   r,   r     s    
$
*r   c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFSwinSelfOutputrh   r   r   r@   rA   rj   c                   sB   t  jdi | tjj|dd| _tjj|jdd| _|| _	d S Ndenserl   rq   r+   )
rr   rs   r   rz   r   r   r|   r   rq   r   r9   rh   r   r~   r   r+   r,   rs   {  s   
zTFSwinSelfOutput.__init__Fr#   r>   input_tensorr\   r]   c                 C  s   |  |}| j||d}|S Nr   r   rq   )r9   r#   r   r\   r+   r+   r,   r        
zTFSwinSelfOutput.callNc                 C  s   | j rd S d| _ t| dd d ur1t| jj | jd d | jg W d    n1 s,w   Y  t| dd d urYt| jj | jd  W d    d S 1 sRw   Y  d S d S )NTr   rq   )	r   r   rG   r   r   rm   r   r   rq   r   r+   r+   r,   r     s   "zTFSwinSelfOutput.buildrh   r   r   r@   rA   rj   r   )r#   r>   r   r>   r\   r]   rA   r>   r   r&   r'   r(   rs   r   r   r   r+   r+   r   r,   r   z  s    r   c                      sD   e Zd Zd fdd	Zd
d Z				ddddZdddZ  ZS )TFSwinAttentionrh   r   r   r@   r   rA   rj   c                   s@   t  jdi | t|||dd| _t||dd| _t | _d S )Nr9   rl   outputr+   )rr   rs   r   r9   r   self_outputsetZpruned_heads)r9   rh   r   r   r~   r   r+   r,   rs     s   zTFSwinAttention.__init__c                 C  s   t )z
        Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in
        this layer}
        )NotImplementedError)r9   headsr+   r+   r,   prune_heads  s   zTFSwinAttention.prune_headsNFr#   r>   r   r/   r   r   r]   r\   c           	      C  s>   | j |||||d}| j|d ||d}|f|dd   }|S )Nr   r   r   )r9   r   )	r9   r#   r   r   r   r\   Zself_outputsattention_outputr   r+   r+   r,   r     s   zTFSwinAttention.callc                 C     | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urUt| jj | jd  W d    d S 1 sNw   Y  d S d S )NTr9   r   )r   r   rG   r   r9   rm   r   r   r   r+   r+   r,   r        "zTFSwinAttention.buildr   r   )r#   r>   r   r/   r   r/   r   r]   r\   r]   rA   r>   r   )r&   r'   r(   rs   r   r   r   r   r+   r+   r   r,   r     s    
r   c                      s2   e Zd Zd fddZdddZdddZ  ZS )TFSwinIntermediaterh   r   r   r@   rA   rj   c                   sZ   t  jdi | tjjt|j| dd| _t|j	t
r$t|j	 | _n|j	| _|| _d S )Nr   rl   r+   )rr   rs   r   rz   r   r@   	mlp_ratior   r   Z
hidden_actstrr   intermediate_act_fnr   r   r   r+   r,   rs     s   
zTFSwinIntermediate.__init__r#   r>   c                 C  s   |  |}| |}|S r   )r   r   )r9   r#   r+   r+   r,   r     s   

zTFSwinIntermediate.callNc                 C  sl   | j rd S d| _ t| dd d ur4t| jj | jd d | jg W d    d S 1 s-w   Y  d S d S NTr   )r   r   rG   r   r   rm   r   r   r   r+   r+   r,   r     s   "zTFSwinIntermediate.buildr   )r#   r>   rA   r>   r   r   r+   r+   r   r,   r     s    
	r   c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFSwinOutputrh   r   r   r@   rA   rj   c                   sF   t  jdi | tjj|dd| _tj|jd| _|| _	|| _
d S r   )rr   rs   r   rz   r   r   r|   r}   rq   rh   r   r   r   r+   r,   rs     s
   
zTFSwinOutput.__init__Fr#   r>   r\   r]   c                 C  s   |  |}| j||d}|S r   r   )r9   r#   r\   r+   r+   r,   r     r   zTFSwinOutput.callNc                 C  sx   | j rd S d| _ t| dd d ur:t| jj | jd d t| jj	| j
 g W d    d S 1 s3w   Y  d S d S r   )r   r   rG   r   r   rm   r   r@   rh   r   r   r   r+   r+   r,   r     s   ""zTFSwinOutput.buildr   r   )r#   r>   r\   r]   rA   r>   r   r   r+   r+   r   r,   r     s    r   c                      sT   e Zd Z		d$d% fddZd&ddZd'ddZ			d(d)d d!Zd*d"d#Z  ZS )+TFSwinLayerrX   r   r   r   r   r@   drop_path_rater[   
shift_sizerA   rj   c           	        s   t  jdi | |j| _t|}||jkr|n|j| _|| jkr$dn|| _|| _tj	j
|jdd| _t|||dd| _|dkrGt|ddntj	jddd| _tj	j
|jd	d| _t||d
d| _t||dd| _|| _d S )Nr   layernorm_beforer   	attentionrl   rX   rf   linearlayernorm_afterintermediater   r+   )rr   rs   Zchunk_size_feed_forwardrG   
reduce_minr?   r  r   r   rz   r{   layer_norm_epsr  r   r  r   
Activationrf   r  r   r  r   swin_outputr   )	r9   rh   r   r   r   r  r  r~   min_resr   r+   r,   rs     s    


zTFSwinLayer.__init__rK   rL   r?   r/   c              	   C  s  t ||f}d| f| | f| dff}d| f| | f| dff}|dkrd}|D ]T}	|D ]O}
t |	d | |	d | d }t |
d | |
d | d }t t jt ||ddd}t|dkr|t jt|f|jd| }t 	|||}|d7 }q1q-t 
|d}t 
|d}t||}t |d|| f}t 
|dt 
|d }t |dktd|}t |dktd	|}|S )
Nr   rF   r   r   )rF   rC   )r   rC   g      YrX   )rG   r   r   rH   r   r   r`   Zonesr   Ztensor_scatter_nd_updater   rO   rc   r[   )r9   rK   rL   r?   r  Zimg_maskZheight_slicesZwidth_slicescountZheight_sliceZwidth_sliceZheight_indsZ
width_indsindicesZupdatesZmask_windows	attn_maskr+   r+   r,   get_attn_mask  s.     
	
zTFSwinLayer.get_attn_maskr#   r>   Tuple[tf.Tensor, tf.Tensor]c                 C  s\   |||  | }|||  | }ddgd|gd|gddgg}t ||}t |d}||fS )Nr   r   )rG   r   rH   )r9   r#   r?   rK   rL   	pad_rightZ
pad_bottomr   r+   r+   r,   r   !  s   zTFSwinLayer.maybe_padNFr   r   r   r]   r\   c                 C  s  t |}|| jkrdn| j}|| jkr|n| j}|\}	}
t|\}}}|}| j||d}t |||	|
|f}| |||	|
\}}t|\}}}}|dkrZt j|| | fdd}n|}t	||}t |d|| |f}| j
||||d}| j|||||d}|d }t |d|||f}t||||}|dkrt j|||fdd}n|}|d dkp|d	 dk}|r|d d d |	d |
d d f }t |||	|
 |f}|| j||d }| j||d}| |}|| j||d }|r||d
 f}|S |f}|S )Nr   r   )r   rC   )shiftr   rF   )rK   rL   r?   r  )r   r\   r   rE   r   )rG   r  r?   r  r   r  rH   r   ZrollrO   r  r  rW   rf   r  r  r  )r9   r#   r   r   r   r\   r  r  r?   rK   rL   rJ   r   r   Zshortcutr   Z
height_padZ	width_padZshifted_hidden_statesZhidden_states_windowsr  Zattention_outputsr   Zattention_windowsZshifted_windowsZ
was_paddedZlayer_outputlayer_outputsr+   r+   r,   r   +  sN   
	

 
zTFSwinLayer.callc                 C  s  | j rd S d| _ t| dd d ur1t| jj | jd d | jg W d    n1 s,w   Y  t| dd d urVt| jj | jd  W d    n1 sQw   Y  t| dd d ur{t| j	j | j	d  W d    n1 svw   Y  t| dd d urt| j
j | j
d d | jg W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d urt| jj | jd  W d    d S 1 sw   Y  d S d S )NTr  r  rf   r  r  r  )r   r   rG   r   r  rm   r   r   r  rf   r  r  r  r   r+   r+   r,   r   m  s8   "zTFSwinLayer.build)rX   r   )
r   r   r   r@   r  r[   r  r@   rA   rj   )
rK   r@   rL   r@   r?   r@   r  r@   rA   r/   )
r#   r>   r?   r@   rK   r@   rL   r@   rA   r  NFF)r#   r>   r   r   r   r/   r   r]   r\   r]   rA   r>   r   )	r&   r'   r(   rs   r  r   r   r   r   r+   r+   r   r,   r     s    

Br   c                      s:   e Zd Zd! fddZ			d"d#ddZd$dd Z  ZS )%TFSwinStagerh   r   r   r@   r   r   depthr   rf   List[float]
downsampler   rA   rj   c           	        sv   t  jdi |  | _| _ fddt|D | _|d ur3|ttjj	dddd| _
nd | _
d| _d S )	Nc                   sB   g | ]}t  |d  dkrdn jd  | d| dqS )rC   r   zblocks.)rh   r   r   r   r  r  rm   )r   r?   ).0irh   r   rf   r   r   r+   r,   
<listcomp>  s    
z(TFSwinStage.__init__.<locals>.<listcomp>ro   )rp   r  )r   r   rm   Fr+   )rr   rs   rh   r   r   blocksr   r   rz   r{   r  Zpointing)	r9   rh   r   r   r  r   rf   r  r~   r   r  r,   rs     s   


zTFSwinStage.__init__NFr#   r>   r   r   r/   r   r   r\   r]   r   c                 C  s   |\}}t | jD ]\}}	|d ur|| nd }
|	|||
||d}|d }q	| jd urH|d d |d d }}||||f}| j|d ||d}n||||f}||f}|r\||dd  7 }|S )Nr   r   r   rC   )	enumerater  r  )r9   r#   r   r   r   r\   rK   rL   r  layer_modulelayer_head_maskr  Zheight_downsampledZwidth_downsampledr   Zstage_outputsr+   r+   r,   r     s    


zTFSwinStage.callc              	   C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urV| jD ]}t|j |d  W d    n1 sPw   Y  q8d S d S )NTr  r  )r   r   rG   r   r  rm   r   r  r9   rd   layerr+   r+   r,   r     s   
zTFSwinStage.build)rh   r   r   r@   r   r   r  r@   r   r@   rf   r  r  r   rA   rj   r  )r#   r>   r   r   r   r/   r   r   r\   r]   rA   r   r   r   r+   r+   r   r,   r    s    ,r  c                      s>   e Zd Zd fddZ						ddddZdddZ  ZS )TFSwinEncoderrh   r   rv   r   c                   sp   t  jdi | t j_ _ttddt	 j j
   fddtjD _d_d S )Nr   r   c                   s   g | ]I}t  t jd |  d d |  d d |  f j|  j| t jd| t jd|d   |jd k rCtndd| dqS )rC   r   r   Nzlayers.)rh   r   r   r  r   rf   r  rm   )r  r@   rx   depthsr   sum
num_layersr   )r  Zi_layerrh   Zdprrv   r9   r+   r,   r    s    *z*TFSwinEncoder.__init__.<locals>.<listcomp>Fr+   )rr   rs   r`   r%  r'  rh   listrG   Zlinspacer&  r  numpyr   rz   Zgradient_checkpointing)r9   rh   rv   r~   r   r(  r,   rs     s   "
zTFSwinEncoder.__init__NFTr#   r>   r   r   r/   r   r]   output_hidden_statesreturn_dictr\   rA   1Union[Tuple[tf.Tensor, ...], TFSwinEncoderOutput]c                 C  s`  d}|rdnd }	|rdnd }
|rdnd }|r9t |\}}}t||g||R }t|d}|	|f7 }	|
|f7 }
t| jD ][\}}|d urJ|| nd }||||||d}|d }|d }|d |d f}||f7 }|rt |\}}}t||g||R }t|d}|	|f7 }	|
|f7 }
|r||dd  7 }q>|std	d
 ||	|fD S t||	||
dS )Nr+   r   r   r   r   r   rF   rC   c                 s  s    | ]	}|d ur|V  qd S r   r+   )r  vr+   r+   r,   	<genexpr>   s    z%TFSwinEncoder.call.<locals>.<genexpr>)r!   r#   r$   r%   )r   rG   rH   rI   r  rz   tupler   )r9   r#   r   r   r   r+  r,  r\   Zall_input_dimensionsZall_hidden_statesZall_reshaped_hidden_statesZall_self_attentionsrJ   r   r   Zreshaped_hidden_stater  r   r!  r  r   r+   r+   r,   r     sH   






zTFSwinEncoder.callc              	   C  sj   | j rd S d| _ t| dd d ur1| jD ]}t|j |d  W d    n1 s+w   Y  qd S d S )NTrz   )r   r   rz   rG   r   rm   r   r"  r+   r+   r,   r   )  s   
zTFSwinEncoder.build)rh   r   rv   r   )NFFTF)r#   r>   r   r   r   r/   r   r]   r+  r]   r,  r]   r\   r]   rA   r-  r   r   r+   r+   r   r,   r$    s    9r$  c                   @  s   e Zd ZdZeZdZdZdS )TFSwinPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    swinr   N)r&   r'   r(   r)   r   config_classZbase_model_prefixZmain_input_namer+   r+   r+   r,   r1  3  s
    r1  a`  
    This model is a Tensorflow
    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a
    regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a:  
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
r   r   c                 C  s6   | du r	t j } |  }|dvrtdt|  |S )z
    From tensorflow addons
    https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71
    N>   Zchannels_firstchannels_lastzWThe `data_format` argument must be one of "channels_first", "channels_last". Received: )r   backendZimage_data_formatlowerr   r   )r   data_formatr+   r+   r,   normalize_data_format`  s   

r8  c                      sL   e Zd ZdZejdfd fddZdddZdddZd fddZ	  Z
S )AdaptiveAveragePooling1Da|  
    Args:
    Average 1D Pooling with adaptive kernel size.
      output_size: An integer or tuple/list of a single integer, specifying pooled_features.
        The new size of output channels.
      data_format: A string,
        one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs.
        `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while `channels_first` corresponds
        to inputs with shape `(batch, channels, steps)`.
    Input shape:
      - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`.
      - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`.
    Output shape:
      - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`.
      - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`.

    Adapted from [tensorflow-addon's adaptive pooling.py](
        https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120
    )
    Noutput_sizeUnion[int, Iterable[int]]reduce_functionr   r7  Optional[str]rA   rj   c                   s@   t || _|| _t|tr|fnt|| _t jdi | d S r   )	r8  r7  r<  r   r@   r0  r:  rr   rs   )r9   r:  r<  r7  r~   r   r+   r,   rs     s   
z!AdaptiveAveragePooling1D.__init__inputsr>   c                 G  st   | j d }| jdkr"tj||dd}tj|dd}| j|dd}|S tj||dd}tj|dd}| j|dd}|S )Nr   r4  r   r   rC   r   )r:  r7  rG   splitr   r<  )r9   r>  argsZbinsZsplitsZout_vectr+   r+   r,   r     s   

zAdaptiveAveragePooling1D.callrd   Iterable[int]r   c                 C  s\   t | }| jdkrt |d | jd |d g}|S t |d |d | jd g}|S )Nr4  r   rC   r   )rG   ZTensorShapeas_listr7  r:  )r9   rd   rP   r+   r+   r,   compute_output_shape  s   
z-AdaptiveAveragePooling1D.compute_output_shapeDict[str, Any]c                   s$   | j | jd}t  }i ||S )N)r:  r7  )r:  r7  rr   
get_config)r9   rh   Zbase_configr   r+   r,   rE    s
   
z#AdaptiveAveragePooling1D.get_config)r:  r;  r<  r   r7  r=  rA   rj   )r>  r>   rA   rj   )rd   rA  rA   r   )rA   rD  )r&   r'   r(   r)   rG   Zreduce_meanrs   r   rC  rE  r   r+   r+   r   r,   r9  o  s    

r9  c                      sl   e Zd ZeZ	d&d' fd
dZd(ddZd)ddZd*ddZe								d+d,d"d#Z
d-d$d%Z  ZS ).TFSwinMainLayerTFrh   r   add_pooling_layerr]   ri   rA   rj   c                   s   t  jdi | || _t|j| _t|jd| jd   | _t	||dd| _
t|| j
jdd| _tjj|jdd| _|rEtd	d
| _d S d | _d S )NrC   r   r   )ri   rm   encoderrl   	layernormr   r   )r:  r+   )rr   rs   rh   r`   r%  r'  r@   rx   num_featuresrg   r   r$  rw   rH  r   rz   r{   r	  rI  r9  poolerr9   rh   rG  ri   r~   r   r+   r,   rs     s   zTFSwinMainLayer.__init__rt   c                 C  s   | j jS r   )r   rk   r8   r+   r+   r,   get_input_embeddings  s   z$TFSwinMainLayer.get_input_embeddingsheads_to_pruneDict[int, List]c                 C  s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrH  r#  r  r   )r9   rO  r#  r   r+   r+   r,   _prune_heads  s   zTFSwinMainLayer._prune_headsr   Optional[Any]r	   c                 C  s   |d urt d gt| jj S r   )r   r`   rh   r%  )r9   r   r+   r+   r,   get_head_mask  s   zTFSwinMainLayer.get_head_maskNr   r/   r   r   r   r+  r,  r\   /Union[TFSwinModelOutput, Tuple[tf.Tensor, ...]]c              	   C  s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j|||d\}}	| j||	|||||d}
|
d }| j||d}d }| j	d urgt
|\}}}| 	|}t|||f}|su||f|
dd   }|S t|||
j|
j|
jdS )N You have to specify pixel_values)r   r\   r   r   r+  r,  r\   r   r   r   )r!   r0   r#   r$   r%   )rh   r   r+  use_return_dictr   rT  r   rH  rI  rL  r   rG   rH   r.   r#   r$   r%   )r9   r   r   r   r   r+  r,  r\   Zembedding_outputr   Zencoder_outputssequence_outputpooled_outputrJ   r   rK  r   r+   r+   r,   r     sH   




zTFSwinMainLayer.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d ur~t| jj | jd d | j	g W d    d S 1 sww   Y  d S d S )NTr   rH  rI  )
r   r   rG   r   r   rm   r   rH  rI  rK  r   r+   r+   r,   r     s    "zTFSwinMainLayer.buildTFrh   r   rG  r]   ri   r]   rA   rj   )rA   rt   )rO  rP  )r   rS  rA   r	   NNNNNNFr   r/   r   r/   r   r/   r   r   r+  r   r,  r   r\   r]   rA   rU  r   )r&   r'   r(   r   r3  rs   rN  rR  rT  r   r   r   r   r+   r+   r   r,   rF    s"    


<rF  z^The bare Swin Model transformer outputting raw hidden-states without any specific head on top.c                      sd   e Zd Z	dd fd
dZeeeeee	de
de							dd ddZd!ddZ  ZS )"TFSwinModelTFrh   r   rG  r]   ri   rA   rj   c                   s,   t  j|fi | || _t|dd| _d S )Nr2  rl   )rr   rs   rh   rF  r2  rM  r   r+   r,   rs   !  s   zTFSwinModel.__init__Zvision)
checkpointoutput_typer3  Zmodalityexpected_outputNr   r/   r   r   r   r   r+  r,  r\   rU  c           	   	   C  sh   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j|||||||d}|S )z
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        NrV  )r   r   r   r   r+  r,  r\   )rh   r   r+  rX  r   r2  )	r9   r   r   r   r   r+  r,  r\   Zswin_outputsr+   r+   r,   r   (  s    
zTFSwinModel.callc                 C  sd   | j rd S d| _ t| dd d ur0t| jj | jd  W d    d S 1 s)w   Y  d S d S )NTr2  )r   r   rG   r   r2  rm   r   r   r+   r+   r,   r   T  s   "zTFSwinModel.buildr[  r\  r]  r^  r   )r&   r'   r(   rs   r   SWIN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr.   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   r   r   r+   r+   r   r,   r_    s*    #r_  c                      s,   e Zd ZdZd fddZdd
dZ  ZS )TFSwinPixelShufflez0TF layer implementation of torch.nn.PixelShuffleupscale_factorr@   rA   rj   c                   s<   t  jdi | t|tr|dk rtd| || _d S )NrC   z1upscale_factor must be an integer value >= 2 got r+   )rr   rs   r   r@   r   rh  )r9   rh  r~   r   r+   r,   rs   `  s   
zTFSwinPixelShuffle.__init__rU   r>   c                   s~   |}t |\}}}}| jd  t|  t fddt D g}tj|t||dgdd}tjj	|| jdd}|S )	NrC   c                   s&   g | ]}t D ]}||   qqS r+   )r   )r  r  jZblock_size_squaredZoutput_depthr+   r,   r  p  s   & z+TFSwinPixelShuffle.call.<locals>.<listcomp>r   rF   )paramsr  Z
batch_dimsZNHWC)
block_sizer7  )
r   rh  r@   rG   Zconstantr   r   Ztiler   Zdepth_to_space)r9   rU   r#   rJ   r   Znum_input_channelsZpermutationr+   rj  r,   r   f  s   
zTFSwinPixelShuffle.call)rh  r@   rA   rj   r   r   r+   r+   r   r,   rg  ]  s    rg  c                      s2   e Zd Zd fddZddd	ZdddZ  ZS )TFSwinDecoderrh   r   c                   sN   t  jdi | tjj|jd |j dddd| _t|jdd| _	|| _
d S )NrC   r   0)r   r   r   rm   1rl   r+   )rr   rs   r   rz   r   Zencoder_striderM   conv2drg  pixel_shufflerh   )r9   rh   r~   r   r+   r,   rs   x  s   
zTFSwinDecoder.__init__rU   r>   rA   c                 C  s4   |}t |d}| |}| |}t |d}|S )Nr   r   )rG   rI   rp  rq  )r9   rU   r#   r+   r+   r,   r     s   

zTFSwinDecoder.callNc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d | jjg W d    n1 s.w   Y  t| dd d ur[t| j	j | j	d  W d    d S 1 sTw   Y  d S d S )NTrp  rq  )
r   r   rG   r   rp  rm   r   rh   r   rq  r   r+   r+   r,   r     s   "zTFSwinDecoder.buildrh   r   r   r   r   r+   r+   r   r,   rm  w  s    

rm  zvSwin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://arxiv.org/abs/2111.09886).c                      sZ   e Zd Zd fddZeeeeede								ddddZ
dddZ  ZS )TFSwinForMaskedImageModelingrh   r   c                   s0   t  | t|dddd| _t|dd| _d S )NFTr2  )rG  ri   rm   decoderrl   )rr   rs   rF  r2  rm  rt  r9   rh   r   r+   r,   rs     s   z%TFSwinForMaskedImageModeling.__init__)ra  r3  NFr   r/   r   r   r   r   r+  r,  r\   r]   rA   -Union[Tuple, TFSwinMaskedImageModelingOutput]c              	   C  s  |dur|n| j j}| j|||||||d}|d }	t|	d}	t|	\}
}}t|d  }}t|	|
|||f}	| |	}d}|dur| j j	| j j
 }t|d||f}t|| j j
d}t|| j j
d}t|d}t|tj}tjt|d	t|d	}t|d}t|| }t|d
 | j j }|| }t|d}|s|f|dd  }|dur|f| S |S t|||j|j|jdS )aA  
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, TFSwinForMaskedImageModeling
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
        >>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)r   r   r   r+  r,  r\   r   r   g      ?rF   r   rC   )r   rC   r   r   ro   rJ  )r3   r4   r#   r$   r%   )rh   rX  r2  rG   rI   r   r@   rH   rt  r   r   r   r   rQ   Zfloat32r   ZlossesZmean_absolute_errorr   rM   r2   r#   r$   r%   )r9   r   r   r   r   r+  r,  r\   r   rY  rJ   rM   Zsequence_lengthrK   rL   Zreconstructed_pixel_valuesZmasked_im_losssizer   Zreconstruction_lossZ
total_lossZnum_masked_pixelsr   r+   r+   r,   r     sV   *



z!TFSwinForMaskedImageModeling.callc                 C  r   )NTr2  rt  )r   r   rG   r   r2  rm   r   rt  r   r+   r+   r,   r     r   z"TFSwinForMaskedImageModeling.buildrr  r]  )r   r/   r   r/   r   r/   r   r   r+  r   r,  r   r\   r]   rA   rv  r   )r&   r'   r(   rs   r   rc  r   r2   re  r   r   r   r   r+   r+   r   r,   rs    s    
]rs  z
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                      s^   e Zd Zd fddZeeeeee	e
de							ddddZdddZ  ZS )TFSwinForImageClassificationrh   r   c                   sZ   t  | |j| _t|dd| _|jdkr"tjj|jdd| _	d S tjjddd| _	d S )Nr2  rl   r   
classifierr  )
rr   rs   Z
num_labelsrF  r2  r   rz   r   r
  ry  ru  r   r+   r,   rs     s   
z%TFSwinForImageClassification.__init__)r`  ra  r3  rb  NFr   r/   r   labelsr   r   r+  r,  r\   r]   rA   9Union[Tuple[tf.Tensor, ...], TFSwinImageClassifierOutput]c                 C  s   |dur|n| j j}| j||||||d}|d }	| j|	|d}
|du r&dn| ||
}|sB|
f|dd  }|dur@|f| S |S t||
|j|j|jdS )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrW  r   r   rC   )r3   r:   r#   r$   r%   )	rh   rX  r2  ry  Zhf_compute_lossr<   r#   r$   r%   )r9   r   r   rz  r   r+  r,  r\   r   rZ  r:   r3   r   r+   r+   r,   r   $  s,   	z!TFSwinForImageClassification.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d ur`t| jdrbt| jj | jd d | jj	g W d    d S 1 sYw   Y  d S d S d S )NTr2  ry  rm   )
r   r   rG   r   r2  rm   r   hasattrry  rK  r   r+   r+   r,   r   Y  s   "z"TFSwinForImageClassification.buildrr  r]  )r   r/   r   r/   rz  r/   r   r   r+  r   r,  r   r\   r]   rA   r{  r   )r&   r'   r(   rs   r   rc  r   _IMAGE_CLASS_CHECKPOINTr<   re  _IMAGE_CLASS_EXPECTED_OUTPUTr   r   r   r   r+   r+   r   r,   rx    s&    -rx  )rx  rs  r_  r1  )r=   r>   r?   r@   rA   r>   )
rN   r>   r?   r@   rK   r@   rL   r@   rA   r>   )rX   FT)
rY   r>   rZ   r[   r\   r]   r^   r]   rA   r>   )r   r   rA   r   )Sr)   
__future__r   collections.abcr   rS   r5   dataclassesr   	functoolsr   typingr   r   r   r   r	   r
   r   r   Z
tensorflowrG   Zactivations_tfr   Zmodeling_tf_utilsr   r   r   r   r   r   Ztf_utilsr   utilsr   r   r   r   r   r   Zconfiguration_swinr   Z
get_loggerr&   loggerre  rd  rf  r}  r~  r   r.   r2   r<   rO   rW   rf   rz   ZLayerrg   rt   r   r   r   r   r   r   r   r   r  r$  r1  ZSWIN_START_DOCSTRINGrc  r8  r9  rF  r_  rg  rm  rs  rx  __all__r+   r+   r+   r,   <module>   s   (  
 #,
#
DDI ' UY
?m=tP