o
    Zh,                    @  sn  d Z ddlmZ ddlZddlZddlmZmZmZm	Z	 ddl
ZddlmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZm Z m!Z! ddl"m#Z# dZ$g dg dgZ%g dg dg dgg dg dg dggZ&dZ'G dd dej(j)Z*G dd dej(j)Z+G dd dej(j)Z,G dd dej(j)Z-G dd  d ej(j)Z.G d!d" d"ej(j)Z/G d#d$ d$ej(j)Z0G d%d& d&ej(j)Z1G d'd( d(ej(j)Z2eG d)d* d*ej(j)Z3G d+d, d,eZ4d-Z5d.Z6ed/e5G d0d1 d1e4Z7G d2d3 d3ej(j)Z8ed4e5G d5d6 d6e4eZ9ed7e5G d8d9 d9e4eZ:ed:e5G d;d< d<e4eZ;g d=Z<dS )>zTF 2.0 LayoutLMv3 model.    )annotationsN)ListOptionalTupleUnion   )get_tf_activation)TFBaseModelOutputTFQuestionAnsweringModelOutputTFSequenceClassifierOutputTFTokenClassifierOutput)TFPreTrainedModelTFQuestionAnsweringLossTFSequenceClassificationLossTFTokenClassificationLossget_initializerkeraskeras_serializableunpack_inputs)check_embeddings_within_bounds)add_start_docstrings%add_start_docstrings_to_model_forwardreplace_return_docstrings   )LayoutLMv3Configr   )      r   )r      r   )r   r   r      )   r   r      )	   
         )            )            )            g    חc                      s6   e Zd ZdZd fddZdd	d
ZdddZ  ZS )TFLayoutLMv3PatchEmbeddingsz$LayoutLMv3 image (patch) embeddings.configr   c              
     s   t  jd	i | t|jtjjr|jn|j|jf}tjj	|j
||dddt|jdd| _|j
| _
|jd |d |d   | _|| _d S )
NZvalidZchannels_lastTproj)filtersZkernel_sizestridespaddingZdata_formatuse_biaskernel_initializernamer   r   r    )super__init__
isinstance
patch_sizecollectionsabcIterabler   layersZConv2Dhidden_sizer   initializer_ranger3   
input_sizenum_patchesr2   )selfr2   kwargsZpatch_sizes	__class__r:   d/var/www/auris/lib/python3.10/site-packages/transformers/models/layoutlmv3/modeling_tf_layoutlmv3.pyr<   B   s$   


z$TFLayoutLMv3PatchEmbeddings.__init__pixel_values	tf.Tensorreturnc                 C  s6   t j|g dd}| |}t |d| j| jf}|S )N)r   r   r   r   perm)tf	transposer3   reshaperF   rC   )rG   rL   
embeddingsr:   r:   rK   callW   s   
z TFLayoutLMv3PatchEmbeddings.callNc                 C  sp   | j rd S d| _ t| dd d ur6t| jj | jd d d | jjg W d    d S 1 s/w   Y  d S d S )NTr3   )	builtgetattrrR   
name_scoper3   r9   buildr2   Znum_channelsrG   input_shaper:   r:   rK   rZ   `   s   "z!TFLayoutLMv3PatchEmbeddings.buildr2   r   rL   rM   rN   rM   N__name__
__module____qualname____doc__r<   rV   rZ   __classcell__r:   r:   rI   rK   r1   ?   s
    
	r1   c                      sl   e Zd ZdZd  fddZd!d	d
Zd"ddZd#ddZd$ddZ						d%d&ddZ	d'ddZ
  ZS )(TFLayoutLMv3TextEmbeddingszm
    LayoutLMv3 text embeddings. Same as `RobertaEmbeddings` but with added spatial (layout) embeddings.
    r2   r   c                   s0  t  jdi | tjj|j|jt|jdd| _	tjj|j
|jt|jdd| _tjj|jdd| _tj|j| _|j| _tjj|j|jt|jdd| _tjj|j|jt|jdd| _tjj|j|jt|jdd| _tjj|j|jt|jd	d| _tjj|j|jt|jd
d| _|j| _|| _d S )Nword_embeddings)Zembeddings_initializerr9   token_type_embeddings	LayerNormepsilonr9   position_embeddingsx_position_embeddingsy_position_embeddingsh_position_embeddingsw_position_embeddingsr:   )r;   r<   r   rB   Z	EmbeddingZ
vocab_sizerC   r   rD   rg   Ztype_vocab_sizerh   LayerNormalizationlayer_norm_epsri   Dropouthidden_dropout_probdropoutZpad_token_idpadding_token_indexZmax_position_embeddingsrl   Zmax_2d_position_embeddingsZcoordinate_sizerm   rn   Z
shape_sizero   rp   max_2d_positionsr2   rG   r2   rH   rI   r:   rK   r<   n   s`   
z#TFLayoutLMv3TextEmbeddings.__init__bboxrM   rN   c              
   C  sz  z.|d d d d df }|d d d d df }|d d d d df }|d d d d df }W n t y? } zt d|d }~ww z| |}| |}| |}	| |}
W n t yl } z
t d| j d|d }~ww | jd }| t|d d d d df |d d d d df  d|}| t|d d d d df |d d d d df  d|}tj|||	|
||gdd	}|S )
Nr   r   r   r   z9Bounding box is not of shape (batch_size, seq_length, 4).z0The `bbox` coordinate values should be within 0-z range.rQ   axis)	
IndexErrorrm   rn   rw   ro   rR   Zclip_by_valuerp   concat)rG   ry   Zleft_position_idsZupper_position_idsZright_position_idsZlower_position_ids	exceptionZleft_position_embeddingsZupper_position_embeddingsZright_position_embeddingsZlower_position_embeddingsZmax_position_idro   rp   spatial_position_embeddingsr:   r:   rK   %calculate_spatial_position_embeddings   sP   




44z@TFLayoutLMv3TextEmbeddings.calculate_spatial_position_embeddingsinputs_embdsc                 C  sh   t |}|d }| jd }| j| d }t j||t jd}|d }t |d|f}t ||df}|S )z
        We are provided embeddings directly. We cannot infer which are padded, so just generate sequential position
        ids.
        r   dtyper   )rR   shaperv   rangeint32rT   tile)rG   r   r\   Zsequence_lengthstart_indexZ	end_indexposition_ids
batch_sizer:   r:   rK   &create_position_ids_from_inputs_embeds   s   

zATFLayoutLMv3TextEmbeddings.create_position_ids_from_inputs_embeds	input_idsc                 C  s8   t t || j|j}t j|dd| }|| j }|S )z}
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_token_index + 1.
        r   rz   )rR   cast	not_equalrv   r   Zcumsum)rG   r   maskr   r:   r:   rK   "create_position_ids_from_input_ids   s   
z=TFLayoutLMv3TextEmbeddings.create_position_ids_from_input_idsinputs_embedsc                 C  s   |d u r	|  |S | |S r_   )r   r   )rG   r   r   r:   r:   rK   create_position_ids   s   

z.TFLayoutLMv3TextEmbeddings.create_position_idsNFtf.Tensor | NoneOptional[tf.Tensor]token_type_idsr   trainingboolc                 C  s   |d u r
|  ||}|d urt|}n	t|d d }|d u r)tj||jd}|d u r9t|| jj | |}| |}|| }	| 	|}
|	|
7 }	| 
|}|	|7 }	| |	}	| j|	|d}	|	S )NrQ   r   r   )r   rR   r   zerosr   r   rg   Z	input_dimrh   rl   r   ri   ru   )rG   r   ry   r   r   r   r   r\   rh   rU   rl   r   r:   r:   rK   rV      s&   	




zTFLayoutLMv3TextEmbeddings.callc                 C  s|  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d ur|t| jj | jd d | j	j
g W d    n1 sww   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n	1 sw   Y  t| d	d d ur<t| jj | jd  W d    d S 1 s5w   Y  d S d S )
NTrg   rh   ri   rl   rm   rn   ro   rp   )rW   rX   rR   rY   rg   r9   rZ   rh   ri   r2   rC   rl   rm   rn   ro   rp   r[   r:   r:   rK   rZ     sH   $z TFLayoutLMv3TextEmbeddings.buildr]   )ry   rM   rN   rM   )r   rM   rN   rM   )r   rM   rN   rM   )r   rM   r   rM   rN   rM   )NNNNNF)r   r   ry   r   r   r   r   r   r   r   r   r   rN   rM   r_   )ra   rb   rc   rd   r<   r   r   r   r   rV   rZ   re   r:   r:   rI   rK   rf   i   s    
2
)

	%rf   c                      sP   e Zd Zd  fddZd!ddZd"d#ddZ			d$d%ddZd&ddZ  ZS )'TFLayoutLMv3SelfAttentionr2   r   c                   s   t  jd	i | |j|j dkrtd|j d|j d|j| _t|j|j | _| j| j | _t	| j| _
tjj| jt|jdd| _tjj| jt|jdd| _tjj| jt|jdd| _tj|j| _|j| _|j| _|| _d S )
Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()queryr8   r9   keyvaluer:   )r;   r<   rC   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizemathsqrtattention_score_normaliserr   rB   Denser   rD   r   r   r   rs   Zattention_probs_dropout_probru   has_relative_attention_biashas_spatial_attention_biasr2   rx   rI   r:   rK   r<   +  s<   

z"TFLayoutLMv3SelfAttention.__init__xrM   c                 C  s@   t |}|d |d | j| jf}t ||}t j|g ddS )Nr   r   r   r   r   r   rO   )rR   r   r   r   rT   rS   )rG   r   r   Z	new_shaper:   r:   rK   transpose_for_scoresM  s   
z.TFLayoutLMv3SelfAttention.transpose_for_scores    attention_scoresalphaUnion[float, int]c                 C  s<   || }t jt j|dddd}|| | }t jj|ddS )a  
        https://arxiv.org/abs/2105.13290 Section 2.4 Stabilization of training: Precision Bottleneck Relaxation
        (PB-Relax). A replacement of the original keras.layers.Softmax(axis=-1)(attention_scores). Seems the new
        attention_probs will result in a slower speed and a little bias. Can use
        tf.debugging.assert_near(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison. The
        smaller atol (e.g., 1e-08), the better.
        rQ   rz   )rR   expand_dimsZ
reduce_maxr   Zsoftmax)rG   r   r   Zscaled_attention_scoresZ	max_valueZnew_attention_scoresr:   r:   rK   cogview_attentionX  s   z+TFLayoutLMv3SelfAttention.cogview_attentionNFhidden_statesattention_maskr   	head_maskoutput_attentionsr   rel_pos
rel_2d_posr   rN   4Union[Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor]]c                 C  s"  |  | |}|  | |}	|  | |}
|
| j }tj|g dd}t||}| jr<| j	r<||| | j 7 }n
| jrF||| j 7 }|d urN||7 }| 
|}| j||d}|d urb|| }t||	}tj|g dd}t|}t||d |d | jf}|r||f}|S |f}|S )N)r   r   r   r   rO   r   r   r   r   )r   r   r   r   r   rR   rS   matmulr   r   r   ru   r   rT   r   )rG   r   r   r   r   r   r   r   Z	key_layerZvalue_layerZquery_layerZnormalised_query_layerZtransposed_key_layerr   Zattention_probsZcontext_layerr   outputsr:   r:   rK   rV   e  s<   



zTFLayoutLMv3SelfAttention.callc                 C  s  | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d ur\t| j	j | j	d d | jjg W d    n1 sWw   Y  t| dd d urt| j
j | j
d d | jjg W d    d S 1 sw   Y  d S d S )NTr   r   r   )rW   rX   rR   rY   r   r9   rZ   r2   rC   r   r   r[   r:   r:   rK   rZ     s    "zTFLayoutLMv3SelfAttention.buildr]   )r   rM   )r   )r   rM   r   r   NNFr   rM   r   r   r   r   r   r   r   r   r   r   r   r   rN   r   r_   )	ra   rb   rc   r<   r   r   rV   rZ   re   r:   r:   rI   rK   r   *  s    
"5r   c                      4   e Zd Zd fddZddddZdddZ  ZS )TFLayoutLMv3SelfOutputr2   r   c                   ^   t  jdi | tjj|jt|jdd| _tjj	|j
dd| _tjj|jd| _|| _d S Ndenseunitsr8   r9   ri   rj   )Zrater:   r;   r<   r   rB   r   rC   r   rD   r   rq   rr   ri   rs   rt   ru   r2   rx   rI   r:   rK   r<        
zTFLayoutLMv3SelfOutput.__init__Fr   rM   input_tensorr   r   rN   c                 C  .   | j |d}| j||d}| j|| d}|S Ninputs)r   r   r   ru   ri   rG   r   r   r   r:   r:   rK   rV        zTFLayoutLMv3SelfOutput.callNc                 C  s   | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d ur_t| j	j | j	d d | jjg W d    d S 1 sXw   Y  d S d S NTr   ri   )
rW   rX   rR   rY   r   r9   rZ   r2   rC   ri   r[   r:   r:   rK   rZ        "zTFLayoutLMv3SelfOutput.buildr]   Fr   rM   r   rM   r   r   rN   rM   r_   ra   rb   rc   r<   rV   rZ   re   r:   r:   rI   rK   r         
r   c                      :   e Zd Zd fddZ			ddddZdddZ  ZS )TFLayoutLMv3Attentionr2   r   c                   s2   t  jdi | t|dd| _t|dd| _d S )NrG   r9   outputr:   )r;   r<   r   self_attentionr   self_outputrx   rI   r:   rK   r<     s   zTFLayoutLMv3Attention.__init__NFr   rM   r   r   r   r   r   r   r   r   rN   r   c              	   C  sB   | j |||||||d}| j|d ||d}	|	f|dd   }
|
S )Nr   r   r   )r   r   )rG   r   r   r   r   r   r   r   Zself_outputsattention_outputr   r:   r:   rK   rV     s   
	zTFLayoutLMv3Attention.callc                 C     | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urUt| jj | jd  W d    d S 1 sNw   Y  d S d S )NTr   r   )rW   rX   rR   rY   r   r9   rZ   r   r[   r:   r:   rK   rZ        "zTFLayoutLMv3Attention.buildr]   r   r   r_   r   r:   r:   rI   rK   r     s    r   c                      s2   e Zd Zd fddZddd	ZdddZ  ZS )TFLayoutLMv3Intermediater2   r   c                   sZ   t  jdi | tjj|jt|jdd| _t	|j
tr$t|j
| _n|j
| _|| _d S )Nr   r   r:   )r;   r<   r   rB   r   intermediate_sizer   rD   r   r=   Z
hidden_actstrr   intermediate_act_fnr2   rx   rI   r:   rK   r<     s   
z!TFLayoutLMv3Intermediate.__init__r   rM   rN   c                 C  s   | j |d}| |}|S )Nr   )r   r   )rG   r   r:   r:   rK   rV      s   
zTFLayoutLMv3Intermediate.callNc                 C  sn   | j rd S d| _ t| dd d ur5t| jj | jd d | jjg W d    d S 1 s.w   Y  d S d S )NTr   )	rW   rX   rR   rY   r   r9   rZ   r2   rC   r[   r:   r:   rK   rZ     s   "zTFLayoutLMv3Intermediate.buildr]   )r   rM   rN   rM   r_   r   r:   r:   rI   rK   r     s    
r   c                      r   )TFLayoutLMv3Outputr2   r   c                   r   r   r   rx   rI   r:   rK   r<     r   zTFLayoutLMv3Output.__init__Fr   rM   r   r   r   rN   c                 C  r   r   r   r   r:   r:   rK   rV     r   zTFLayoutLMv3Output.callNc                 C  s   | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d ur_t| j	j | j	d d | jj
g W d    d S 1 sXw   Y  d S d S r   )rW   rX   rR   rY   r   r9   rZ   r2   r   ri   rC   r[   r:   r:   rK   rZ   "  r   zTFLayoutLMv3Output.buildr]   r   r   r_   r   r:   r:   rI   rK   r     r   r   c                      r   )TFLayoutLMv3Layerr2   r   c                   s@   t  jdi | t|dd| _t|dd| _t|dd| _d S )N	attentionr   intermediater   r:   )r;   r<   r   r   r   r   r   bert_outputrx   rI   r:   rK   r<   /  s   zTFLayoutLMv3Layer.__init__NFr   rM   r   r   r   r   r   r   r   r   rN   r   c              	   C  sT   | j |||||||d}|d }	|dd  }
| |	}| j||	|d}|f|
 }
|
S )N)r   r   r   r   r   r   r   )r   r   r   )rG   r   r   r   r   r   r   r   Zself_attention_outputsr   r   Zintermediate_outputZlayer_outputr:   r:   rK   rV   5  s   
	

zTFLayoutLMv3Layer.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d urzt| jj | jd  W d    d S 1 ssw   Y  d S d S )NTr   r   r   )	rW   rX   rR   rY   r   r9   rZ   r   r   r[   r:   r:   rK   rZ   O  s    "zTFLayoutLMv3Layer.buildr]   r   r   r_   r   r:   r:   rI   rK   r   .  s    r   c                      sl   e Zd Zd( fddZd)d
dZd*ddZd+ddZd,ddZ								d-d.d$d%Zd/d&d'Z	  Z
S )0TFLayoutLMv3Encoderr2   r   c                   s   t  jdi |  | _ fddt jD | _ j| _ j| _| jr; j| _ j	| _	t
jj jt jddd| _| jrf j| _ j| _t
jj jt jddd| _t
jj jt jddd| _d S d S )	Nc                   s   g | ]}t  d | dqS )zlayer.r   )r   ).0ir2   r:   rK   
<listcomp>b  s    z0TFLayoutLMv3Encoder.__init__.<locals>.<listcomp>Frel_pos_bias)r   r8   r7   r9   rel_pos_x_biasrel_pos_y_biasr:   )r;   r<   r2   r   num_hidden_layerslayerr   r   rel_pos_binsmax_rel_posr   rB   r   r   r   rD   r   max_rel_2d_posrel_2d_pos_binsr   r   rx   rI   r   rK   r<   _  s<   zTFLayoutLMv3Encoder.__init__relative_positionsrM   num_bucketsr   max_distancec                 C  s   |d }t |}|d }||k }t jt |t j| }t|| }|| ||  }	||	 }
t |
|j}
t |
|d }
t |dk|j| t |||
 S )Nr   r   r   )	rR   absr   logr   float32r   minimumwhere)rG   r   r   r   ZbucketsZmax_exact_bucketsZis_smallZbuckets_log_ratioZdistance_log_ratioZbuckets_big_offsetZbuckets_bigr:   r:   rK   relative_position_bucket  s   
z,TFLayoutLMv3Encoder.relative_position_bucketdense_layerkeras.layers.Denser   c           	      C  sh   t j|ddt j|dd }| |||}t j||| jd}||}t |g d}t j|| jd}|S )Nrz   rQ   )depthr   )r   r   r   r   r   )rR   r   r   Zone_hotcompute_dtyperS   r   )	rG   r   r   r   r   Zrel_pos_matrixr   Zrel_pos_one_hotZ	embeddingr:   r:   rK   _cal_pos_emb  s   z TFLayoutLMv3Encoder._cal_pos_embc                 C  s   |  | j|| j| jS r_   )r  r   r   r   )rG   r   r:   r:   rK   _cal_1d_pos_emb  s   z#TFLayoutLMv3Encoder._cal_1d_pos_embry   c                 C  sd   |d d d d df }|d d d d df }|  | j|| j| j}|  | j|| j| j}|| }|S )Nr   r   )r  r   r   r   r   )rG   ry   Zposition_coord_xZposition_coord_yZ	rel_pos_xZ	rel_pos_yr   r:   r:   rK   _cal_2d_pos_emb  s    z#TFLayoutLMv3Encoder._cal_2d_pos_embNFTr   r   r   r   r   r   output_hidden_statesreturn_dictr   rN   oUnion[TFBaseModelOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor]]c
              
   C  s   |rdnd }
|r
dnd }| j r| |nd }| jr| |nd }t| jD ]-\}}|r0|
|f }
|d ur8|| nd }||||||||	d}|d }|rR||d f }q%|rZ|
|f }
|rct||
|dS tdd ||
|fD S )Nr:   )r   r   r   r   r   Zlast_hidden_stater   
attentionsc                 s  s    | ]	}|d ur|V  qd S r_   r:   )r   r   r:   r:   rK   	<genexpr>  s    z+TFLayoutLMv3Encoder.call.<locals>.<genexpr>)r   r  r   r  	enumerater   r	   tuple)rG   r   ry   r   r   r   r  r  r   r   Zall_hidden_statesZall_self_attentionsr   r   r   Zlayer_moduleZlayer_head_maskZlayer_outputsr:   r:   rK   rV     s@   


zTFLayoutLMv3Encoder.callc              	   C  s`  | j rd S d| _ t| dd d ur1t| jj | jd d | jg W d    n1 s,w   Y  t| dd d urZt| jj | jd d | j	g W d    n1 sUw   Y  t| dd d urt| j
j | j
d d | j	g W d    n1 s~w   Y  t| dd d ur| jD ]}t|j |d  W d    n1 sw   Y  qd S d S )NTr   r   r   r   )rW   rX   rR   rY   r   r9   rZ   r   r   r   r   r   )rG   r\   r   r:   r:   rK   rZ     s.   
zTFLayoutLMv3Encoder.buildr]   )r   rM   r   r   r   r   )r   r   r   rM   r   r   r   r   )r   rM   )ry   rM   )NNNFFTNF)r   rM   ry   r   r   r   r   r   r   r   r  r   r  r   r   r   r   r   rN   r	  r_   )ra   rb   rc   r<   r   r  r  r  rV   rZ   re   r:   r:   rI   rK   r   ^  s     
"


9r   c                      s   e Zd ZeZd9 fddZd:ddZd;d
dZd<ddZdd Z	d=d>ddZ
d?ddZd@d d!ZdAd#d$ZdBd(d)Ze												*dCdDd7d8Z  ZS )ETFLayoutLMv3MainLayerr2   r   c                   s   t  jdi | || _|jrt|dd| _|jrQt|dd| _t	j
j|jdd| _t	j
j|jdd| _|js:|jrH|j|j }| j||fd t	j
jdd	d| _t|d
d| _d S )NrU   r   patch_embedri   rj   ru   )
image_sizegư>normencoderr:   )r;   r<   r2   Z
text_embedrf   rU   visual_embedr1   r  r   rB   rq   rr   ri   rs   rt   ru   r   r   rE   r>   init_visual_bboxr  r   r  )rG   r2   rH   r  rI   r:   rK   r<     s   zTFLayoutLMv3MainLayer.__init__Nc                 C  sR  | j jr2| j j| j j }| jdd| j jfddtjdd| _| jd|| d | j jfddtjdd| _	| j
r7d S d| _
t| dd d ur_t| jj | jd  W d    n1 sZw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| d	d d urt| jj | jd  W d    n1 sw   Y  t| d
d d urt| jj | jd d | j jg W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d ur't| jj | jd d | j jg W d    d S 1 s w   Y  d S d S )Nr   r   T	cls_token)r   ZinitializerZ	trainabler   r9   	pos_embedr  rU   r  ri   ru   r  )r2   r  rE   r>   Z
add_weightrC   rR   r   r  r  rW   rX   rY   r  r9   rZ   rU   r  ri   ru   r  )rG   r\   r  r:   r:   rK   rZ   $  sX   $zTFLayoutLMv3MainLayer.buildrN   keras.layers.Layerc                 C  s   | j jS r_   )rU   rg   )rG   r:   r:   rK   get_input_embeddingsL  s   z*TFLayoutLMv3MainLayer.get_input_embeddingsr   tf.Variablec                 C  s   || j j_d S r_   )rU   rg   weight)rG   r   r:   r:   rK   set_input_embeddingsO  s   z*TFLayoutLMv3MainLayer.set_input_embeddingsc                 C  s   t )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        )NotImplementedError)rG   Zheads_to_pruner:   r:   rK   _prune_headsS  s   z"TFLayoutLMv3MainLayer._prune_heads  r  Tuple[int, int]max_lenr   c           	      C  s  |\}}t d||d  || }t j|dd}t ||dg}t d||d  || }t j|dd}t |d|g}t j|d d d df |d d |d d dd f |dd  gdd}t |ddg}t jdd|d |d ggt jd}t j||gdd| _	d S )Nr   r   rz   rQ   r   r   )
rR   r   r   r   stackrT   Zconstantr   r}   visual_bbox)	rG   r  r!  heightwidthZvisual_bbox_xZvisual_bbox_yr#  Zcls_token_boxr:   r:   rK   r  Z  s   :"z&TFLayoutLMv3MainLayer.init_visual_bboxr   r   tf.DTypec                 C  s4   t j| jdd}t ||ddg}t j||d}|S )Nr   rz   r   r   )rR   r   r#  r   r   )rG   r   r   r#  r:   r:   rK   calculate_visual_bboxq  s   z+TFLayoutLMv3MainLayer.calculate_visual_bboxrL   rM   c                 C  sf   |  |}t|d }t| j|ddg}tj||gdd}t| dd d ur,|| j7 }| |}|S )Nr   r   rz   r  )	r  rR   r   r   r  r}   rX   r  r  )rG   rL   rU   r   Z
cls_tokensr:   r:   rK   embed_imagew  s   


z!TFLayoutLMv3MainLayer.embed_imager   c                 C  sx   t |j}|dkrtj|dd}n|dkr$tj|dd}tj|dd}n	td|j dt|| j}d| t }|S )Nr   r   rz   r   z&Wrong shape for attention_mask (shape ).g      ?)lenr   rR   r   r   r   r  LARGE_NEGATIVE)rG   r   n_dimsextended_attention_maskr:   r:   rK   get_extended_attention_mask  s   
z1TFLayoutLMv3MainLayer.get_extended_attention_maskr   r   (Union[tf.Tensor, List[tf.Tensor | None]]c                 C  s   |d u rd g| j j S t|}|dkr>tj|dd}tj|dd}tj|dd}tj|dd}t|| j jddddg}n'|dkrXtj|dd}tj|dd}tj|dd}n|dkretd|j dt|dkswJ d	t| d
t|| j	}|S )Nr   r   rz   rQ   r   r   z!Wrong shape for head_mask (shape r)  zGot head_mask rank of z, but require 5.)
r2   r   rR   Zrankr   r   r   r   r   r  )rG   r   r,  r:   r:   rK   get_head_mask  s(   
$z#TFLayoutLMv3MainLayer.get_head_maskFr   ry   r   r   r   r   Optional[bool]r  r  r   r   r	  c              
   C  s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur0t|}|d }|d }n"|d urBt|}|d }|d }n|d urNt|d }ntd|d urZ|j}n|d urb|j}n|d urj|j}n|d urr|j}ntj}|d us}|d ur|d u rtj	||f|d}|d u rtj
||f|d}|d u rtj
||df|d}| j||||||d}d }d }|d uri| |}tj	|t|d f|d}|d u r|}n	tj||gdd}| j jr| ||}|d u r|}n	tj||gdd}| j js| j jrFtjdt|d |d}tj|dd}t||dg}|d us%|d urDtjtjd||ddd}t||dg}tj||gdd}n|}|d u rS|d u rS|}n	tj||gdd}| |}| j||d}n-| j jss| j jr| j jrtjtjd||ddd}t||dg}|}| j jr|}| |}| |}| j||||||	|
|d	}|d }|s|f|dd   S t||j|jd
S )Nr   r   zEYou have to specify either input_ids or inputs_embeds or pixel_valuesr   r   )r   ry   r   r   r   r   rz   r   )ry   r   r   r   r   r  r  r
  )r2   r   r  r  rR   r   r   r   r   Zonesr   rU   r(  r}   r   r'  r   r   r   r   ri   ru   r.  r0  r  r	   r   r  )rG   r   ry   r   r   r   r   r   rL   r   r  r  r   r\   r   
seq_lengthZ	int_dtypeZembedding_outputZ
final_bboxZfinal_position_idsZvisual_embeddingsZvisual_attention_maskr#  Zvisual_position_idsr-  Zencoder_outputssequence_outputr:   r:   rK   rV     s   



	






zTFLayoutLMv3MainLayer.callr]   r_   )rN   r  )r   r  )r  )r  r   r!  r   )r   r   r   r&  r^   )r   rM   rN   rM   )r   r   rN   r/  NNNNNNNNNNNFr   r   ry   r   r   r   r   r   r   r   r   r   r   r   rL   r   r   r1  r  r1  r  r1  r   r   rN   r	  )ra   rb   rc   r   config_classr<   rZ   r  r  r  r  r'  r(  r.  r0  r   rV   re   r:   r:   rI   rK   r    s4    

(




r  c                      s,   e Zd ZdZeZdZe fddZ  Z	S )TFLayoutLMv3PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    
layoutlmv3c                   s"   t  j}tjdtjdd|d< |S )N)NNr   ry   r   )r;   input_signaturerR   Z
TensorSpecr   )rG   sigrI   r:   rK   r9  g  s   z+TFLayoutLMv3PreTrainedModel.input_signature)
ra   rb   rc   rd   r   r6  Zbase_model_prefixpropertyr9  re   r:   r:   rI   rK   r7  ^  s    r7  a	  
    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
    behavior.

    <Tip>

    TensorFlow models and layers in `transformers` accept two formats as input:

    - having all inputs as keyword arguments (like PyTorch models), or
    - having all inputs as a list, tuple or dict in the first positional argument.

    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
    positional argument:

    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`

    Note that when creating models and layers with
    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
    about any of this, as you can just pass inputs like you would to any other Python function!

    </Tip>

    Parameters:
        config ([`LayoutLMv3Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

        bbox (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Batch of document images. Each image is divided into patches of shape `(num_channels, config.patch_size,
            config.patch_size)` and the total number of patches (=`patch_sequence_length`) equals to `((height /
            config.patch_size) * (width / config.patch_size))`.

        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
zdThe bare LayoutLMv3 Model transformer outputting raw hidden-states without any specific head on top.c                      sh   e Zd ZdgZ fddZeeeee	e
d												ddddZdddZ  ZS )TFLayoutLMv3Modelr   c                   s,   t  j|g|R i | t|dd| _d S )Nr8  r   )r;   r<   r  r8  )rG   r2   r   rH   rI   r:   rK   r<     s   zTFLayoutLMv3Model.__init__output_typer6  NFr   r   ry   r   r   r   r   rL   r   r1  r  r  r   r   rN   r	  c                 C  s&   | j |||||||||	|
||d}|S )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, TFAutoModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = TFAutoModel.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```)r   ry   r   r   r   r   r   rL   r   r  r  r   )r8  )rG   r   ry   r   r   r   r   r   rL   r   r  r  r   r   r:   r:   rK   rV     s   /zTFLayoutLMv3Model.callc                 C  sd   | j rd S d| _ t| dd d ur0t| jj | jd  W d    d S 1 s)w   Y  d S d S )NTr8  )rW   rX   rR   rY   r8  r9   rZ   r[   r:   r:   rK   rZ   0  s   "zTFLayoutLMv3Model.buildr4  r5  r_   )ra   rb   rc   "_keys_to_ignore_on_load_unexpectedr<   r   r   LAYOUTLMV3_INPUTS_DOCSTRINGr   r	   _CONFIG_FOR_DOCrV   rZ   re   r:   r:   rI   rK   r<    s(    
=r<  c                      s8   e Zd ZdZd fddZddddZdddZ  ZS )TFLayoutLMv3ClassificationHeadz\
    Head for sentence-level classification tasks. Reference: RobertaClassificationHead
    r2   r   c                   s~   t  jdi | tjj|jdt|jdd| _|j	d ur |j	n|j
}tjj|dd| _tjj|jt|jdd| _|| _d S )	Ntanhr   )Z
activationr8   r9   ru   r   out_projr   r:   )r;   r<   r   rB   r   rC   r   rD   r   classifier_dropoutrt   rs   ru   
num_labelsrD  r2   )rG   r2   rH   rE  rI   r:   rK   r<   >  s&   
z'TFLayoutLMv3ClassificationHead.__init__Fr   rM   r   r   rN   c                 C  s4   | j ||d}| |}| j ||d}| |}|S )Nr   )ru   r   rD  )rG   r   r   r   r:   r:   rK   rV   T  s
   

z#TFLayoutLMv3ClassificationHead.callNc                 C  s  | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d urWt| j	j | j	d  W d    n1 sRw   Y  t| dd d urt| j
j | j
d d | jjg W d    d S 1 s}w   Y  d S d S )NTr   ru   rD  )rW   rX   rR   rY   r   r9   rZ   r2   rC   ru   rD  r[   r:   r:   rK   rZ   [  s    "z$TFLayoutLMv3ClassificationHead.buildr]   r   )r   rM   r   r   rN   rM   r_   r`   r:   r:   rI   rK   rB  9  s
    rB  a
  
    LayoutLMv3 Model with a sequence classification head on top (a linear layer on top of the final hidden state of the
    [CLS] token) e.g. for document image classification tasks such as the
    [RVL-CDIP](https://www.cs.cmu.edu/~aharley/rvl-cdip/) dataset.
    c                      l   e Zd ZdgZd fddZeeeee	e
d													ddddZd ddZ  ZS )!%TFLayoutLMv3ForSequenceClassificationr   r2   r   c                   s:   t  j|fi | || _t|dd| _t|dd| _d S )Nr8  r   
classifier)r;   r<   r2   r  r8  rB  rI  rx   rI   r:   rK   r<   v  s   z.TFLayoutLMv3ForSequenceClassification.__init__r=  NFr   r   r   r   r   r   labelsr   r1  r  r  ry   rL   r   rN   Union[TFSequenceClassifierOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]]c                 C  s   |
dur|
n| j j}
| j||||||||	|
|||d}|d dddddf }| j||d}|du r5dn| ||}|
sQ|f|dd  }|durO|f| S |S t|||j|jdS )a  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, TFAutoModelForSequenceClassification
        >>> from datasets import load_dataset
        >>> import tensorflow as tf

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = TFAutoModelForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, words, boxes=boxes, return_tensors="tf")
        >>> sequence_label = tf.convert_to_tensor([1])

        >>> outputs = model(**encoding, labels=sequence_label)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```Nr   r   r   r   r   r   r  r  ry   rL   r   r   r   r   losslogitsr   r  )r2   use_return_dictr8  rI  hf_compute_lossr   r   r  )rG   r   r   r   r   r   r   rJ  r   r  r  ry   rL   r   r   r3  rO  rN  r   r:   r:   rK   rV   |  s6   4z*TFLayoutLMv3ForSequenceClassification.callc                 C  r   )NTr8  rI  )rW   rX   rR   rY   r8  r9   rZ   rI  r[   r:   r:   rK   rZ     r   z+TFLayoutLMv3ForSequenceClassification.buildr]   NNNNNNNNNNNNF)r   r   r   r   r   r   r   r   r   r   r   r   rJ  r   r   r1  r  r1  r  r1  ry   r   rL   r   r   r1  rN   rK  r_   )ra   rb   rc   r?  r<   r   r   r@  r   r   rA  rV   rZ   re   r:   r:   rI   rK   rH  j  s*    

QrH  a  
    LayoutLMv3 Model with a token classification head on top (a linear layer on top of the final hidden states) e.g.
    for sequence labeling (information extraction) tasks such as [FUNSD](https://guillaumejaume.github.io/FUNSD/),
    [SROIE](https://rrc.cvc.uab.es/?ch=13), [CORD](https://github.com/clovaai/cord) and
    [Kleister-NDA](https://github.com/applicaai/kleister-nda).
    c                      rG  )!"TFLayoutLMv3ForTokenClassificationr   r2   r   c                   s~   t  j|fi | |j| _t|dd| _tjj|jdd| _	|jdk r3tjj
|jt|jdd| _nt|dd| _|| _d S )Nr8  r   ru   r"   rI  r   )r;   r<   rF  r  r8  r   rB   rs   rt   ru   r   r   rD   rI  rB  r2   rx   rI   r:   rK   r<     s   


z+TFLayoutLMv3ForTokenClassification.__init__r=  NFr   r   ry   r   r   r   r   rJ  r   r1  r  r  rL   r   rN   Union[TFTokenClassifierOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]]c                 C  s   |dur|n| j j}| j||||||||	|
|||d}|dur%t|}n	t|dd }|d }|d ddd|f }| j||d}| |}|du rPdn| ||}|sl|f|dd  }|durj|f| S |S t|||j	|j
dS )ag  
        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, TFAutoModelForTokenClassification
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = TFAutoModelForTokenClassification.from_pretrained("microsoft/layoutlmv3-base", num_labels=7)

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> word_labels = example["ner_tags"]

        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="tf")

        >>> outputs = model(**encoding)
        >>> loss = outputs.loss
        >>> logits = outputs.logits
        ```N)ry   r   r   r   r   r   r   r  r  rL   r   rQ   r   r   r   rM  )r2   rP  r8  rR   r   ru   rI  rQ  r   r   r  )rG   r   ry   r   r   r   r   r   rJ  r   r  r  rL   r   r   r\   r2  r3  rO  rN  r   r:   r:   rK   rV     s@   6
z'TFLayoutLMv3ForTokenClassification.callc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d urt| jj | jd d | j	j
g W d    d S 1 sxw   Y  d S d S )NTr8  ru   rI  )rW   rX   rR   rY   r8  r9   rZ   ru   rI  r2   rC   r[   r:   r:   rK   rZ   W  s    "z(TFLayoutLMv3ForTokenClassification.buildr]   rR  )r   r   ry   r   r   r   r   r   r   r   r   r   r   r   rJ  r   r   r1  r  r1  r  r1  rL   r   r   r1  rN   rT  r_   )ra   rb   rc   r?  r<   r   r   r@  r   r   rA  rV   rZ   re   r:   r:   rI   rK   rS    s*    
[rS  a  
    LayoutLMv3 Model with a span classification head on top for extractive question-answering tasks such as
    [DocVQA](https://rrc.cvc.uab.es/?ch=17) (a linear layer on top of the text part of the hidden-states output to
    compute `span start logits` and `span end logits`).
    c                      sn   e Zd ZdgZd fddZeeeee	e
d														d d!ddZd"ddZ  ZS )# TFLayoutLMv3ForQuestionAnsweringr   r2   r   c                   s<   t  j|fi | |j| _t|dd| _t|dd| _d S )Nr8  r   
qa_outputs)r;   r<   rF  r  r8  rB  rV  rx   rI   r:   rK   r<   r  s   z)TFLayoutLMv3ForQuestionAnswering.__init__r=  NFr   r   r   r   r   r   start_positionsend_positionsr   r1  r  ry   rL   r  r   r   rN   Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor], Tuple[tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor], Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]]c                 C  s   |dur|n| j j}| j|||||||	|
||||d}|d }| j||d}tj|ddd\}}tj|dd}tj|dd}d}|durV|durV||d	}| j|||fd
}|sm||f|dd  }|durk|f| S |S t||||j	|j
dS )ak  
        start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoProcessor, TFAutoModelForQuestionAnswering
        >>> from datasets import load_dataset
        >>> import tensorflow as tf

        >>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)
        >>> model = TFAutoModelForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")

        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> question = "what's his name?"
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> encoding = processor(image, question, words, boxes=boxes, return_tensors="tf")
        >>> start_positions = tf.convert_to_tensor([1])
        >>> end_positions = tf.convert_to_tensor([3])

        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
        >>> loss = outputs.loss
        >>> start_scores = outputs.start_logits
        >>> end_scores = outputs.end_logits
        ```NrL  r   r   r   rQ   )r   Znum_or_size_splitsr{   )inputr{   )Zstart_positionZend_position)rO  r   )rN  start_logits
end_logitsr   r  )r2   rP  r8  rV  rR   splitZsqueezerQ  r
   r   r  )rG   r   r   r   r   r   r   rW  rX  r   r  ry   rL   r  r   r   r3  rO  r[  r\  rN  rJ  r   r:   r:   rK   rV   z  sD   A
z%TFLayoutLMv3ForQuestionAnswering.callc                 C  r   )NTr8  rV  )rW   rX   rR   rY   r8  r9   rZ   rV  r[   r:   r:   rK   rZ     r   z&TFLayoutLMv3ForQuestionAnswering.buildr]   )NNNNNNNNNNNNNF)r   r   r   r   r   r   r   r   r   r   r   r   rW  r   rX  r   r   r1  r  r1  ry   r   rL   r   r  r1  r   r   rN   rY  r_   )ra   rb   rc   r?  r<   r   r   r@  r   r
   rA  rV   rZ   re   r:   r:   rI   rK   rU  f  s,    

hrU  )rU  rH  rS  r<  r7  )=rd   
__future__r   r?   r   typingr   r   r   r   Z
tensorflowrR   Zactivations_tfr   Zmodeling_tf_outputsr	   r
   r   r   Zmodeling_tf_utilsr   r   r   r   r   r   r   r   Ztf_utilsr   utilsr   r   r   Zconfiguration_layoutlmv3r   rA  Z_DUMMY_INPUT_IDSZ_DUMMY_BBOXr+  rB   ZLayerr1   rf   r   r   r   r   r   r   r   r  r7  ZLAYOUTLMV3_START_DOCSTRINGr@  r<  rB  rH  rS  rU  __all__r:   r:   r:   rK   <module>   s~   (
* B *0 .  T)MQ1j	  