# coding=utf-8
"""PyTorch UMT5 model."""

import copy
import math
from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from .configuration_umt5 import UMT5Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


class UMT5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__ U/var/www/auris/lib/python3.10/site-packages/transformers/models/umt5/modeling_umt5.pyr'   =   s   

zUMT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   T)Zkeepdim)tor)   Zfloat32powmeanZrsqrtr,   r+   dtypefloat16Zbfloat16)r-   hidden_statesZvariancer2   r2   r3   forwardE   s
   
zUMT5LayerNorm.forward)r%   )__name__
__module____qualname__r'   r<   __classcell__r2   r2   r0   r3   r$   <   s    r$   c                       *   e Zd Zdef fddZdd Z  ZS )UMT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r&   r'   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr-   rC   r0   r2   r3   r'   W   s
   
zUMT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)rJ   rP   rN   
isinstancerK   r+   r)   Tensorr9   int8r6   r-   r;   r2   r2   r3   r<   ^   s   



zUMT5DenseActDense.forwardr=   r>   r?   r!   r'   r<   r@   r2   r2   r0   r3   rB   V   s    rB   c                       rA   )UMT5DenseGatedActDenserC   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S rD   )r&   r'   r   rG   rH   rI   wi_0wi_1rK   rL   rM   rN   r   rO   rP   rQ   r0   r2   r3   r'   n   s   
zUMT5DenseGatedActDense.__init__c                 C   sz   |  | |}| |}|| }| |}t| jjtjr6|j	| jjj	kr6| jjj	tj
kr6|| jjj	}| |}|S rR   )rP   rY   rZ   rN   rS   rK   r+   r)   rT   r9   rU   r6   )r-   r;   Zhidden_geluZhidden_linearr2   r2   r3   r<   v   s   


zUMT5DenseGatedActDense.forwardrW   r2   r2   r0   r3   rX   m   s    rX   c                       rA   )UMT5LayerFFrC   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr/   )r&   r'   Zis_gated_actrX   DenseReluDenserB   r$   rH   layer_norm_epsilon
layer_normr   rL   rM   rN   rQ   r0   r2   r3   r'      s   

zUMT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S rR   )r_   r]   rN   )r-   r;   Zforwarded_statesr2   r2   r3   r<      s   

zUMT5LayerFF.forwardrW   r2   r2   r0   r3   r[      s    

class UMT5Attention(nn.Module):
    """
    T5's attention using relative_attention_bias.
    """

    def __init__(self, config: UMT5Config, has_relative_attention_bias: bool = False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended "
                "and will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()

    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
        new_projection_shape = projection.size()[:-1] + (self.n_heads, self.key_value_proj_dim)
        # move heads to 2nd position: (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
        return new_projection

    def _relative_position_bucket(self, relative_position):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor of relative positions (memory_position - query_position)

        Bidirectionality, the number of buckets, and the maximum distance are not passed as arguments here;
        they are read from the module's configuration (`self.is_decoder`,
        `self.relative_attention_num_buckets`, `self.relative_attention_max_distance`).

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        num_buckets = self.relative_attention_num_buckets
        max_distance = self.relative_attention_max_distance
        if not self.is_decoder:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        log_ratio = torch.log(relative_position.float() / max_exact) / math.log(max_distance / max_exact)
        relative_position_if_large = max_exact + (log_ratio * (num_buckets - max_exact)).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(relative_position)
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.Tensor] = None,
    ):
        batch_size, seq_length = hidden_states.shape[:2]

        # if encoder_hidden_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = encoder_hidden_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross-attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to the cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set a flag that this layer's cross-attention cache is filled so we re-use it in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        # account for already-cached tokens when building the position bias
        real_seq_length = seq_length if past_key_value is None else past_key_value.get_seq_length() + seq_length
        key_length = key_states.shape[-2]
        if not self.has_relative_attention_bias:
            position_bias = torch.zeros(
                (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
            )
        else:
            position_bias = self.compute_bias(
                real_seq_length, key_length, device=scores.device, cache_position=cache_position
            )
            position_bias = position_bias[:, :, -seq_length:, :]

        if attention_mask is not None:
            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
            position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked
        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)
        # (batch_size, seq_length, d_model)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)

        return attn_output, attn_weights, past_key_value

class UMT5LayerSelfAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.SelfAttention = UMT5Attention(config, has_relative_attention_bias=True, layer_idx=layer_idx)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class UMT5LayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.EncDecAttention = UMT5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        attention_mask=None,
        layer_head_mask=None,
        past_key_value=None,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs

class UMT5Block(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(UMT5LayerSelfAttention(config, layer_idx=layer_idx))
        if self.is_decoder:
            self.layer.append(UMT5LayerCrossAttention(config, layer_idx=layer_idx))

        self.layer.append(UMT5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        # Self Attention
        hidden_states, self_attn_weights, past_key_value = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            cache_position=cache_position,
        )

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Cross-Attention Block
        cross_attn_weights = None
        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            hidden_states, cross_attn_weights, past_key_value = self.layer[1](
                hidden_states,
                encoder_hidden_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                cache_position=cache_position,
            )

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                max_dtype = torch.finfo(hidden_states.dtype).max
                clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            max_dtype = torch.finfo(hidden_states.dtype).max
            clamp_value = torch.where(torch.isinf(hidden_states).any(), max_dtype - 1000, max_dtype)
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states, past_key_value)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        return outputs


class UMT5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: UMT5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states

@auto_docstring
class UMT5PreTrainedModel(PreTrainedModel):
    config_class = UMT5Config
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True
    _supports_cache_class = True
    _supports_static_cache = True
    _no_split_modules = ["UMT5Block"]
    _keep_in_fp32_modules = ["wo"]

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, UMT5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(
            module, (UMT5Model, UMT5ForConditionalGeneration, UMT5EncoderModel, UMT5ForQuestionAnswering)
        ):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "qa_outputs"):
                module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
                module.qa_outputs.bias.data.zero_()
        elif isinstance(module, UMT5ForTokenClassification):
            if hasattr(module, "classifier"):
                module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0)
                module.classifier.bias.data.zero_()
        elif isinstance(module, UMT5ClassificationHead):
            module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.dense, "bias") and module.dense.bias is not None:
                module.dense.bias.data.zero_()
            module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
        elif isinstance(module, UMT5DenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, UMT5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the "
                "pad_token_id. See UMT5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids
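
# Behavioural sketch of `_shift_right` above (illustrative values, not part of the original
# file): labels are shifted one slot right, the decoder-start token is prepended, and any
# remaining -100 padding is rewritten to `pad_token_id` so the embedding never sees -100:
#
#     labels            = [[  71, 307, -100, -100]]
#     decoder_input_ids = [[start,  71,  307,  pad]]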

class UMT5Stack(UMT5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)
        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder
        self.block = nn.ModuleList([UMT5Block(config, layer_idx=i) for i in range(config.num_layers)])
        self.final_layer_norm = UMT5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache is True:
            if not self.is_decoder:
                raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        # initialize past_key_values
        return_legacy_cache = False
        return_self_attention_cache = False
        if self.is_decoder and (use_cache or past_key_values is not None):
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, EncoderDecoderCache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
        elif not self.is_decoder:
            # do not pass a cache object down the line for the encoder stack
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache if past_key_values is not None else None,
                output_attentions,
            )
        elif attention_mask is not None:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
        else:
            causal_mask = None

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.is_decoder else None
        next_decoder_cache = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]

            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.forward,
                    hidden_states,
                    causal_mask,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                    use_cache,
                    output_attentions,
                    cache_position,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=causal_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_values,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    cache_position=cache_position,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache = layer_outputs[1]

            if output_attentions:
                all_attentions += (layer_outputs[2],)
                if self.is_decoder:
                    all_cross_attentions += (layer_outputs[3],)

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_attentions, all_cross_attentions]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )
zUMT5Stack.forwardFr   r"   input_tensorr   r  r   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2r   Zflex_attentionr   FZsdpa)r   r  Zis_trainingr    r5   )sequence_lengthtarget_lengthr9   r   r   )cudaZxpuZnpu)rC   Z_attn_implementationr   rS   r)   rT   r#   r   Zis_compileabler   Z_ignore_causal_mask_sdpar   r9   r   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r|   Z_unmask_unattended)r-   r   r  r   r  r   Zpast_seen_tokensZusing_compilable_cacher9   r  r  r   	min_dtyper2   r2   r3   r  H  sT   




zUMT5Stack._update_causal_maskr  r  r9   r   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume the mask already comes in inverted 4D form and requires no further processing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask
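
# Shape sketch for the causal-mask helper above (illustrative, not part of the original
# file): for batch_size=1, sequence_length=3, target_length=3 and fp32, positions that may
# not be attended to hold the most negative representable value instead of -inf:
#
#     [[[[   0.0, -3.4e38, -3.4e38],
#        [   0.0,     0.0, -3.4e38],
#        [   0.0,     0.0,     0.0]]]]  # added to logits, so masked slots vanish after softmax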
Dr   c                &       sJ  e Zd ZdZdZeZddgZ fddZdd Z	d	d
 Z
dd Zdd Zdd Zdd Ze																d'deej deej deej deej deej deej deej deeeej   deeeej   deej deej dee d ee d!ee d"ee d#eej d$eeej ef f"d%d&Z  ZS )(r   ao  
    Examples:

    ```python
    >>> from transformers import UMT5Model, AutoTokenizer

    >>> model = UMT5Model.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
    >>> label = "<extra_id_0> verhandelt"
    >>> inputs = tokenizer(noisy_text, return_tensors="pt")
    >>> labels = tokenizer(text_target=label, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    config_class = UMT5Config
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5Model.from_pretrained("google/umt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
        >>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr   r   r   r	  r   r  r  r   r    r4   r   r;   r  r   r   r   r  r   r   r	  r
  r   r   r  r  r   )r   r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions)rC   r   r  r!  rS   r   lenr#  r   r   r  r;   r  r  )r-   r   r   r   r   r	  r8  r
  r9  r  r   r:  r   r   r  r  r   r;   decoder_outputsr2   r2   r3   r<     s\   Q	zUMT5Model.forwardNNNNNNNNNNNNNNNN)r=   r>   r?   r   
model_typer!   r   _tied_weights_keysr'   r   r   r-  r0  r2  r7  r   r   r)   
LongTensorFloatTensor
BoolTensorrT   r   r   r   r   r<   r@   r2   r2   r0   r3   r     s    	
r   z<
    UMT5 Model with a `language modeling` head on top.
    )Zcustom_introc                (       sv  e Zd ZdZdZg dZ fddZdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Ze																	d-deej deej deej deej deej deej deej deeeej   deeeej   deej deej d eej d!ee d"ee d#ee d$ee d%eej d&eeej ef f$d'd(Zd ejfd)d*Zed+d, Z  ZS ).r   a  
    Examples:

    ```python
    >>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

    >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model_dim = config.d_model
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = UMT5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
        >>> outputs = model.generate(input_ids)
        >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        reordered_past = ()
        for layer_past in past_key_values:
            reordered_past += (
                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
            )
        return reordered_past


@auto_docstring
class UMT5EncoderModel(UMT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import UMT5EncoderModel, AutoTokenizer

    >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```"""

    model_type = "umt5"
    _tied_weights_keys = ["encoder.embed_tokens.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = UMT5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, UMT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
        >>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr;  )rC   r  r!  )	r-   r   r   r	  r   r   r  r  r9  r2   r2   r3   r<     s   #
zUMT5EncoderModel.forward)NNNNNNN)r=   r>   r?   r   rE  rF  r'   r   r   r-  r0  r7  r   r   r)   rG  rH  r   r   r   r   r<   r@   r2   r2   r0   r3   r     sF    	r   z
    UMT5 model with a sequence classification head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                $       s   e Zd ZdgZddgZdef fddZe															ddee	j
 d	ee	j d
ee	j
 dee	j
 dee	j dee	j dee	j deee	j  dee	j dee	j dee	j
 dee dee dee dee deeef f ddZ  ZS )UMT5ForSequenceClassificationFdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rC   c                    s2   t  | t|| _t|| _|   d| _d S rZ  )r&   r'   r   r   r   classification_headr   Zmodel_parallelrQ   r0   r2   r3   r'     s
   


z&UMT5ForSequenceClassification.__init__Nr   r   r   r   r	  r8  r
  r9  r   r:  rN  r   r   r  r  ru   c                 C   sh  |dur|n| j j}|durd}|du r!|	dur!td| jj |du r6|
du r6|du r1td| |}| j|||||||||	|
||||d}|d }|| j j	
|j}tt|ddkrhtd|j\}}}||ddf |d	|ddd	ddf }| |}d}|dur|
|j}| j jdu r| j jdkrd
| j _n| j jdkr|jtjks|jtjkrd| j _nd| j _| j jd
krt }| j jdkr|| | }n-|||}n'| j jdkrt }||d	| j j|d	}n| j jdkrt }|||}|s |f|dd  }|dur|f| S |S t|||j|j|j|j |j!|j"|j#d	S )as
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # Unlike other models, T5/UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class UMT5ForTokenClassification(UMT5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]

    def __init__(self, config: UMT5Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = UMT5EncoderModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   r	  r   r   r  r  r   r5   r4   )rQ  rR  r;   r  )rC   r  r   rN   r   r   rw   r   r   r;   r  )r-   r   r   r	  r   rN  r   r  r  r   r;   rR  rQ  rT  rU  r2   r2   r3   r<     s4   


z"UMT5ForTokenClassification.forward)NNNNNNNN)r=   r>   r?   rc  rF  r!   r'   r   r   r)   rT   r   r   r   r   r<   r@   r2   r2   r0   r3   r     s@    	
r   c                &       s.  e Zd ZddgZ fddZdd Zdd Zd	d
 Zdd Zdd Z	e
																d#deej deej deej deej deej deej deej deeeej   deej deej deej deej dee dee dee dee d eeej ef f"d!d"Z  ZS )$r   r  r  c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _|j| _t|j|j| _|   d S r  )r&   r'   rH   rJ  r   rp   r  r   r  r  rb   r   r   r   r!  r"  r   r#  r   rG   r   r   r$  r0   r2   r3   r'     s    

z!UMT5ForQuestionAnswering.__init__c                 C   r   rR   r&  r   r2   r2   r3   r     r   z-UMT5ForQuestionAnswering.get_input_embeddingsc                 C   r'  rR   r(  r   r2   r2   r3   r     r)  z-UMT5ForQuestionAnswering.set_input_embeddingsc                 C   r*  rR   r+  r   r2   r2   r3   r-  "  r.  z%UMT5ForQuestionAnswering._tie_weightsc                 C   r   rR   r/  r   r2   r2   r3   r0  (  r   z$UMT5ForQuestionAnswering.get_encoderc                 C   r   rR   r1  r   r2   r2   r3   r2  ,  r   z$UMT5ForQuestionAnswering.get_decoderNr   r   r   r   r	  r8  r
  r9  start_positionsend_positionsr   r:  r   r   r  r  ru   c                 C   sj  |dur|n| j j}|dur|n| j j}|	dur|
durd}|du r3|du r3|du r.td| |}|dur9|n| j j}|durC|n| j j}|du rX| j|||||||d}n$|r|t|ts|t|d t|dkrm|d ndt|dkrx|d ndd}|d }| j	|||d||||||||d	}|d }| 
|}|jdd
d\}}|d
 }|d
 }d}|	dur|
durt|	 dkr|	d
|j}	t|
 dkr|
d
|j}
|d}|	d|}	|
d|}
t|d}|||	}|||
}|| d }|s ||f|dd  | }|dur|f| S |S t||||j|j|j|j|j|j|jd
S )aI	  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
            Training](./umt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        # Unlike other models, T5/UMT5 automatically creates decoder_input_ids from input_ids
        # if no decoder inputs are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=None,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split adds a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs, so we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "UMT5EncoderModel",
    "UMT5ForConditionalGeneration",
    "UMT5ForQuestionAnswering",
    "UMT5ForSequenceClassification",
    "UMT5ForTokenClassification",
    "UMT5Model",
    "UMT5PreTrainedModel",
]