"""PyTorch UDOP model."""

import collections
import logging
import math
import random
from abc import ABC, abstractmethod
from copy import deepcopy
from dataclasses import dataclass
from typing import Any, Dict, Optional, Sequence, Tuple, Union

import torch
from torch import Tensor, nn
from torch.nn import CrossEntropyLoss

from transformers import UdopConfig
from transformers.modeling_outputs import Seq2SeqLMOutput, Seq2SeqModelOutput

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, is_torch_flex_attn_available, is_torchdynamo_compiling


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.getLogger(__name__)


@dataclass
class BaseModelOutputWithAttentionMask(ModelOutput):
    """
    Class for the model's outputs that may also contain a past key/values (to speed up sequential decoding). Includes
    an additional attention mask.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model. If `past_key_values` is used only
            the last hidden-state of the sequences of shape `(batch_size, 1, hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or
        when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`. Contains pre-computed hidden-states (key and values in the
            self-attention blocks and optionally if `config.is_encoder_decoder=True` in the cross-attention blocks)
            that can be used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or
        when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when
        `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and
        `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
            used to compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: Optional[torch.FloatTensor] = None
    attention_mask: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


def get_visual_bbox(image_size=224, patch_size=16):
    # Normalized (0-1) bounding boxes for the grid of image patches, one box per
    # patch of an (image_size // patch_size) x (image_size // patch_size) layout.
    ...


def pad_sequence(seq, target_len, pad_value=0):
    # Pads (or truncates) `seq` along the sequence dimension to `target_len`.
    ...


def combine_image_text_embeddings(
    image_embeddings,
    inputs_embeds,
    bbox,
    visual_bbox,
    attention_mask=None,
    num_patches=14,
    max_len=0,
    image_size=224,
    patch_size=16,
):
    """
    Combine the image and text embeddings for the input to the encoder/decoder of UDOP.

    First, the image embeddings are created by checking for each visual patch if it is inside the bounding box of a
    token. If it is, the visual patch is combined with the token embedding. Then, the visual bounding boxes are combined
    with the text bounding boxes. Finally, the visual bounding boxes are combined with the text attention mask.
    """
    ...


class UdopPatchEmbeddings(nn.Module):
    """2D Image to Patch Embeddings"""

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_patches = num_patches
        self.proj = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if height != self.image_size[0] or width != self.image_size[1]:
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model"
                f" ({self.image_size[0]}*{self.image_size[1]})."
            )
        embeddings = self.proj(pixel_values)
        embeddings = embeddings.flatten(2).transpose(1, 2)
        return embeddings


@auto_docstring
class UdopPreTrainedModel(PreTrainedModel):
    config_class = UdopConfig
    base_model_prefix = "transformer"
    _keep_in_fp32_modules = ["wo"]

    def _init_weights(self, module):
        # Normal / truncated-normal initialization scaled by config.initializer_factor,
        # with module-specific handling of embeddings, layer norms, dense layers,
        # attention projections and the relative attention bias (T5-style).
        ...

    def _shift_right(self, input_ids):
        # Shifts `input_ids` one position to the right, prepends
        # `decoder_start_token_id` and replaces -100 labels with `pad_token_id`.
        ...


class UdopLayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the Udop style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Root-mean-square normalization computed in float32, cast back to the
        # weight dtype when running in float16 / bfloat16.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)
        return self.weight * hidden_states


class UdopDenseActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        # wi -> activation -> dropout -> (dtype alignment with wo) -> wo
        ...


class UdopDenseGatedActDense(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        # act(wi_0(x)) * wi_1(x) -> dropout -> (dtype alignment with wo) -> wo
        ...


class UdopLayerFF(nn.Module):
    def __init__(self, config: UdopConfig):
        super().__init__()
        self.DenseReluDense = UdopDenseGatedActDense(config) if config.is_gated_act else UdopDenseActDense(config)
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class UdopAttention(nn.Module):
    def __init__(self, config: UdopConfig, has_relative_attention_bias=False, layer_idx=None):
        super().__init__()
        # q / k / v / o projections (bias=False), an optional relative_attention_bias
        # embedding, dropout and pruned-head bookkeeping, following the T5 attention layer.
        ...

    def prune_heads(self, heads):
        ...

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        # Bidirectional buckets split the range between positive and negative offsets;
        # beyond `max_exact = num_buckets // 2` the bucket index grows logarithmically
        # up to `max_distance`, and everything farther away shares the last bucket.
        ...

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        # Buckets the (memory_position - context_position) offsets and looks them up in
        # `self.relative_attention_bias`, returning a (1, num_heads, query_length, key_length) tensor.
        ...

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Projects queries/keys/values, reuses or updates the (cross-)attention cache,
        # adds the relative position bias (or a zero bias when
        # `has_relative_attention_bias` is False), applies the attention mask, softmax,
        # dropout and the output projection, and returns the attended output together
        # with the position bias, cache and (optionally) the attention weights.
        ...


class UdopLayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx=None):
        super().__init__()
        self.SelfAttention = UdopAttention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, attention_mask=None, position_bias=None, layer_head_mask=None,
                past_key_value=None, use_cache=False, output_attentions=False, cache_position=None):
        # Pre-norm residual block around `self.SelfAttention`.
        ...


class UdopLayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.EncDecAttention = UdopAttention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states, key_value_states, attention_mask=None, position_bias=None,
                layer_head_mask=None, past_key_value=None, use_cache=False, query_length=None,
                output_attentions=False, cache_position=None):
        # Pre-norm residual block around `self.EncDecAttention`.
        ...


class UdopBlock(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx=None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            UdopLayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx)
        )
        if self.is_decoder:
            self.layer.append(UdopLayerCrossAttention(config, layer_idx=layer_idx))
        self.layer.append(UdopLayerFF(config))

    def forward(self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None,
                encoder_attention_mask=None, encoder_decoder_position_bias=None, layer_head_mask=None,
                cross_attn_layer_head_mask=None, past_key_value=None, use_cache=False,
                output_attentions=False, return_dict=True, cache_position=None):
        # Self-attention (+ cross-attention in the decoder) followed by the
        # feed-forward layer, with clamping of the hidden states in float16.
        ...


class UdopCellEmbeddings(nn.Module):
    def __init__(self, max_2d_position_embeddings=501, hidden_size=1024):
        super().__init__()
        self.max_2d_position_embeddings = max_2d_position_embeddings
        self.x_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)
        self.y_position_embeddings = nn.Embedding(max_2d_position_embeddings, hidden_size)

    def forward(self, bbox):
        # Clips the normalized boxes to [0, 1], scales them to embedding indices and
        # sums the left / upper / right / lower position embeddings.
        ...


get_relative_position_bucket = UdopAttention._relative_position_bucket
AUGMENTATION_RANGE = (0.80, 1.25)


class RelativePositionBiasBase(nn.Module, ABC):
    """
    Base class of relative biases.

    Args:
        num_heads (`int`):
            Number of attention heads in the model, it will create embeddings of size `num_heads`, which will be added to the scores of each token pair.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            Pair token metric (distance in the sequence, distance in pixels etc.) will be bucketed, parameter is defining number of such
            buckets.
        bidirectional (`bool`, *optional*, defaults to `True`):
            Whether the distance should be bidirectional for a pair of tokens. If `False`, then distance(tok1, tok2) == distance(tok2, tok1).
        scaling_factor (`int`, *optional*, defaults to 1):
            Defining factor which will be used to scale relative distance.
        max_distance (`int`, *optional*, defaults to 128):
            All distances above this value will end up in the one/same bucket.
        augmentation (`bool`, *optional*, defaults to `False`):
            Whether to multiply relative distances by a random scalar.
        expand (`bool`, *optional*, defaults to `False`):
            Whether to expand an existing pretrained model with subsequent additions of prefix_bucket.
    """

    def __init__(
        self,
        num_heads=None,
        relative_attention_num_buckets=32,
        bidirectional=True,
        scaling_factor=1,
        max_distance=128,
        level="tokens",
        augmentation=False,
        prefix_bucket=False,
        expand=False,
    ):
        super().__init__()
        # One extra head-pair of buckets is reserved when `prefix_bucket` is used without `expand`.
        ...

    @abstractmethod
    def prepare_input(self, attention_mask=None, bbox=None):
        pass

    def get_bucket(self, attention_mask=None, bbox=None):
        # Buckets the relative positions returned by `prepare_input` using
        # `get_relative_position_bucket`.
        ...

    def get_relative_position(self, positions):
        # Pairwise position differences, optionally jittered by a random factor in
        # AUGMENTATION_RANGE during training and scaled by `scaling_factor`.
        ...

    def forward(self, attention_mask=None, bbox=None):
        # Looks the buckets up in `self.relative_attention_bias` and returns a
        # (batch_size, num_heads, seq_length, seq_length) bias tensor.
        ...


class RelativePositionBias1D(RelativePositionBiasBase):
    def __init__(self, scaling_factor=1, max_distance=128, **kwargs):
        """
        Reimplementation of T5 relative position bias. Distance between given tokens is their distance in the sequence.
        Parameters are the same as in base class
        """
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask=None, bbox=None):
        # Token positions 0..seq_len-1; raises "No need to scale 1d features" if scaling_factor != 1.
        ...


class RelativePositionBiasHorizontal(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings horizontal distance between two tokens. Parameters are the same as in base
        class
        """
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask=None, bbox=None):
        # Mean of the horizontal box coordinates (bbox[..., [0, 2]]); requires scaled bboxes and a bbox tensor.
        ...


class RelativePositionBiasVertical(RelativePositionBiasBase):
    def __init__(self, scaling_factor=100, max_distance=100, **kwargs):
        """
        Represents in the bucket embeddings vertical distance between two tokens. Parameters are the same as in base
        class
        """
        super().__init__(scaling_factor=scaling_factor, max_distance=max_distance, **kwargs)

    def prepare_input(self, attention_mask=None, bbox=None):
        # Mean of the vertical box coordinates (bbox[..., [1, 3]]); requires scaled bboxes and a bbox tensor.
        ...


class RelativePositionBiasAggregated(nn.Module):
    def __init__(self, modules: Sequence[RelativePositionBiasBase]):
        """
        Class which sums up various computed biases.

        Args:
            modules (Sequence[RelativePositionBiasBase]):
                List of relative bias modules.
        """
        super().__init__()
        self.biases = nn.ModuleList(modules)

    def forward(self, attention_mask=None, bbox=None):
        # Sum of all registered relative position biases.
        output = 0.0
        for bias in self.biases:
            output = bias(attention_mask, bbox) + output
        return output


BIAS_CLASSES = {
    "1d": RelativePositionBias1D,
    "horizontal": RelativePositionBiasHorizontal,
    "vertical": RelativePositionBiasVertical,
}


def create_relative_bias(config: UdopConfig) -> Sequence[RelativePositionBiasBase]:
    """
    Creates empty list or one/multiple relative biases.

    :param config: Model's configuration :return: Sequence with created bias modules.
    """
    bias_list = []
    if hasattr(config, "relative_bias_args"):
        for bias_kwargs_org in config.relative_bias_args:
            bias_kwargs = deepcopy(bias_kwargs_org)
            bias_type = bias_kwargs.pop("type")
            model_num_heads = config.num_heads if hasattr(config, "num_heads") else config.num_attention_heads
            if "num_heads" in bias_kwargs:
                if bias_kwargs["num_heads"] != model_num_heads:
                    raise ValueError("Number of heads must match num of heads in the model")
            else:
                bias_kwargs["num_heads"] = model_num_heads
            bias_list.append(BIAS_CLASSES[bias_type](**bias_kwargs))

    return bias_list


class UdopStack(UdopPreTrainedModel):
    """
    This class is based on `T5Stack`, but modified to take into account the image modality as well as 2D position
    embeddings.
    """

    def __init__(self, config, embed_tokens=None, embed_patches=None):
        super().__init__(config)
        self.embed_tokens = embed_tokens
        self.embed_patches = embed_patches
        self.is_decoder = config.is_decoder
        self._max_length = config.max_length
        self.num_layers = config.num_layers

        self.block = nn.ModuleList(
            [UdopBlock(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(self.num_layers)]
        )
        self.final_layer_norm = UdopLayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        if not self.is_decoder:
            self.cell_2d_embedding = UdopCellEmbeddings(config.max_2d_position_embeddings, config.hidden_size)

        self.relative_bias = self._get_relative_bias(config)

    def _tie_weights(self):
        # Ties the 1D relative bias of this stack to the relative attention bias of the first block.
        ...

    @staticmethod
    def _get_relative_bias(config) -> RelativePositionBiasAggregated:
        relative_bias_list = create_relative_bias(config)
        return RelativePositionBiasAggregated(relative_bias_list)

    def get_input_embeddings(self):
        return self.embed_tokens

    def get_output_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(self, input_ids=None, attention_mask=None, bbox=None, encoder_hidden_states=None,
                encoder_attention_mask=None, inputs_embeds=None, pixel_values=None, visual_bbox=None,
                image_embeddings=None, position_bias=None, head_mask=None, cross_attn_head_mask=None,
                past_key_values=None, use_cache=None, output_attentions=None, output_hidden_states=None,
                return_dict=None, cache_position=None):
        # Embeds the text tokens, merges them with the image patch embeddings via
        # `combine_image_text_embeddings` (encoder only), adds 2D cell embeddings,
        # runs the stack of UdopBlocks with the aggregated relative bias, and returns
        # a BaseModelOutputWithAttentionMask (or a tuple).
        ...

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position, past_key_values,
                            output_attentions=False):
        # Builds / adapts the causal mask for the configured attention implementation
        # (eager, sdpa, flash_attention_2 or flex_attention).
        ...

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask, sequence_length, target_length, dtype, cache_position, batch_size, **kwargs
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        # Expands a 2D padding mask into a 4D causal mask filled with the dtype
        # minimum outside the allowed positions (mirrors AttentionMaskConverter).
        ...


@auto_docstring
class UdopModel(UdopPreTrainedModel):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        # Shared token embedding, patch embedding and two UdopStacks: a non-causal
        # encoder (with the patch embedder) and a causal decoder with
        # `config.num_decoder_layers` layers, followed by `self.post_init()`.
        ...

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(self, input_ids=None, attention_mask=None, bbox=None, pixel_values=None, visual_bbox=None,
                decoder_input_ids=None, decoder_attention_mask=None, inputs_embeds=None, encoder_outputs=None,
                past_key_values=None, head_mask=None, decoder_inputs_embeds=None, decoder_head_mask=None,
                cross_attn_head_mask=None, use_cache=True, output_attentions=None, output_hidden_states=None,
                return_dict=None, cache_position=None):
        """
        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        visual_bbox (`torch.LongTensor` of shape `(batch_size, patch_sequence_length, 4)`, *optional*):
            Bounding boxes of each patch in the image. If not provided, bounding boxes are created in the model.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) T5 uses the `pad_token_id` as the starting
            token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
            `decoder_input_ids` for pretraining take a look at [T5 Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> from datasets import load_dataset
        >>> import torch

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = AutoModel.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> inputs = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])

        >>> # forward pass
        >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 1, 1024]
        ```
        """
        # Encodes text + image with the encoder UdopStack, runs the decoder on
        # `decoder_input_ids` (or `decoder_inputs_embeds`) and returns a
        # Seq2SeqModelOutput (or a tuple when `return_dict=False`).
        ...


@auto_docstring(
    custom_intro="""
    The UDOP encoder-decoder Transformer with a language modeling head on top, enabling to generate text given document
    images and an optional prompt.

    This class is based on [`T5ForConditionalGeneration`], extended to deal with images and layout (2D) data.
    """
)
class UdopForConditionalGeneration(UdopPreTrainedModel, GenerationMixin):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "decoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
        "decoder.relative_bias.biases.0.relative_attention_bias.weight",
        "lm_head.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        # Same encoder/decoder construction as `UdopModel`, plus an `lm_head` Linear
        # (d_model -> vocab_size, bias=False) for generation, followed by `self.post_init()`.
        ...

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def forward(self, input_ids=None, attention_mask=None, bbox=None, pixel_values=None, visual_bbox=None,
                decoder_input_ids=None, decoder_attention_mask=None, inputs_embeds=None, encoder_outputs=None,
                past_key_values=None, head_mask=None, decoder_inputs_embeds=None, decoder_head_mask=None,
                cross_attn_head_mask=None, use_cache=True, output_attentions=None, output_hidden_states=None,
                return_dict=None, labels=None, cache_position=None):
        """
        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        visual_bbox (`torch.LongTensor` of shape `(batch_size, patch_sequence_length, 4)`, *optional*):
            Bounding boxes of each patch in the image. If not provided, bounding boxes are created in the model.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) T5 uses the `pad_token_id` as the starting
            token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
            `decoder_input_ids` for pretraining take a look at [T5 Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import AutoProcessor, UdopForConditionalGeneration
        >>> from datasets import load_dataset

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]

        >>> # one can use the various task prefixes (prompts) used during pre-training
        >>> # e.g. the task prefix for DocVQA is "Question answering. "
        >>> question = "Question answering. What is the date on the form?"
        >>> encoding = processor(image, question, text_pair=words, boxes=boxes, return_tensors="pt")

        >>> # autoregressive generation
        >>> predicted_ids = model.generate(**encoding)
        >>> print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
        9/30/92
        ```
        """
        # Runs encoder + decoder (shifting `labels` right when no decoder inputs are
        # given), projects with `lm_head` (rescaled by d_model**-0.5 when embeddings
        # are tied) and computes the cross-entropy loss on `labels` if provided,
        # returning a Seq2SeqLMOutput (or a tuple).
        ...

    def _reorder_cache(self, past_key_values, beam_idx):
        # Reorders the legacy tuple cache for beam search; warns that `use_cache=True`
        # speeds up decoding when no cache is present.
        ...


@auto_docstring
class UdopEncoderModel(UdopPreTrainedModel):
    _tied_weights_keys = [
        "encoder.embed_tokens.weight",
        "encoder.embed_patches.proj.weight",
        "encoder.embed_patches.proj.bias",
        "encoder.relative_bias.biases.0.relative_attention_bias.weight",
    ]

    def __init__(self, config: UdopConfig):
        super().__init__(config)
        # Shared token embedding, patch embedding and a single (encoder-only)
        # UdopStack, followed by `self.post_init()`.
        ...

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    def forward(self, input_ids=None, bbox=None, attention_mask=None, pixel_values=None, visual_bbox=None,
                head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None,
                return_dict=None):
        """
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        bbox (`torch.LongTensor` of shape `({0}, 4)`, *optional*):
            Bounding boxes of each input sequence tokens. Selected in the range `[0,
            config.max_2d_position_embeddings-1]`. Each bounding box should be a normalized version in (x0, y0, x1, y1)
            format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1,
            y1) represents the position of the lower right corner.

            Note that `sequence_length = token_sequence_length + patch_sequence_length + 1` where `1` is for [CLS]
            token. See `pixel_values` for `patch_sequence_length`.
        visual_bbox (`torch.LongTensor` of shape `(batch_size, patch_sequence_length, 4)`, *optional*):
            Bounding boxes of each patch in the image. If not provided, bounding boxes are created in the model.

        Example:

        ```python
        >>> from transformers import AutoProcessor, UdopEncoderModel
        >>> from huggingface_hub import hf_hub_download
        >>> from datasets import load_dataset

        >>> # load model and processor
        >>> # in this case, we already have performed OCR ourselves
        >>> # so we initialize the processor with `apply_ocr=False`
        >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False)
        >>> model = UdopEncoderModel.from_pretrained("microsoft/udop-large")

        >>> # load an example image, along with the words and coordinates
        >>> # which were extracted using an OCR engine
        >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train", trust_remote_code=True)
        >>> example = dataset[0]
        >>> image = example["image"]
        >>> words = example["tokens"]
        >>> boxes = example["bboxes"]
        >>> encoding = processor(image, words, boxes=boxes, return_tensors="pt")

        >>> outputs = model(**encoding)
        >>> last_hidden_states = outputs.last_hidden_state
        ```
        """
        # Thin wrapper around the encoder `UdopStack`; returns its
        # BaseModelOutputWithAttentionMask (or a tuple).
        ...


__all__ = ["UdopForConditionalGeneration", "UdopPreTrainedModel", "UdopModel", "UdopEncoderModel"]