
    fThl                       S r SSKrSSKrSSKJrJrJrJr  SSKrSSKJ	r	  SSK
JrJrJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJrJr  SSK J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)  SSK*J+r+  \&" 5       (       a  SSK,J-r-  SSK.J/r/  \)R`                  " \15      r2 " S S\	Rf                  5      r4 " S S\	Rf                  5      r5 " S S\	Rf                  5      r6 " S S\	Rf                  5      r7 " S S\	Rf                  5      r8 " S S\	Rf                  5      r9 " S S\	Rf                  5      r: " S  S!\	Rf                  5      r; " S" S#\	Rf                  5      r<\% " S$ S%\!5      5       r= " S& S'\=5      r>\% " S( S)\=5      5       r?\%" S*S+9 " S, S-\=\5      5       r@\% " S. S/\=5      5       rA\%" S0S+9 " S1 S2\=5      5       rB\% " S3 S4\=5      5       rC\% " S5 S6\=5      5       rD/ S7QrEg)8zPyTorch UMT5 model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )
UMT5Config)	BlockMask)make_flex_block_causal_maskc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )UMT5LayerNorm<   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zU
Construct a layernorm module in the UMT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      ^/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/umt5/modeling_umt5.pyr*   UMT5LayerNorm.__init__=   s/     	ll5::k#:; #    c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )N   T)keepdim)tor,   float32powmeanrsqrtr/   r.   dtypefloat16bfloat16)r0   hidden_statesvariances      r4   forwardUMT5LayerNorm.forwardE   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r6   )r/   r.   )gư>)__name__
__module____qualname____firstlineno__r*   rE   __static_attributes____classcell__r3   s   @r4   r&   r&   <   s    $+ +r6   r&   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseActDenseV   configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r)   r*   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr0   rQ   r3   s     r4   r*   UMT5DenseActDense.__init__W   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r6   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)rY   r_   r]   
isinstancerZ   r.   r,   Tensorr@   int8r;   r0   rC   s     r4   rE   UMT5DenseActDense.forward^   s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r6   )r_   r]   rY   rZ   	rG   rH   rI   rJ   r"   r*   rE   rK   rL   rM   s   @r4   rO   rO   V   s    /z / r6   rO   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5DenseGatedActDensem   rQ   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g rS   )r)   r*   r   rV   rW   rX   wi_0wi_1rZ   r[   r\   r]   r   r^   r_   r`   s     r4   r*   UMT5DenseGatedActDense.__init__n   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r6   c                 8   U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      n[	        U R
                  R                  [        R                  5      (       a  UR                  U R
                  R                  R                  :w  aa  U R
                  R                  R                  [        R                  :w  a/  UR                  U R
                  R                  R                  5      nU R                  U5      nU$ rc   )r_   rn   ro   r]   rd   rZ   r.   r,   re   r@   rf   r;   )r0   rC   hidden_geluhidden_linears       r4   rE   UMT5DenseGatedActDense.forwardv   s    hhtyy78		-0#3]3 tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r6   )r_   r]   rn   ro   rZ   ri   rM   s   @r4   rk   rk   m   s    /z / r6   rk   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )UMT5LayerFF   rQ   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr2   )r)   r*   is_gated_actrk   DenseReluDenserO   r&   rW   layer_norm_epsilon
layer_normr   r[   r\   r]   r`   s     r4   r*   UMT5LayerFF.__init__   s_    "8"@D"3F";D'F<U<UVzz&"5"56r6   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ rc   )r}   r{   r]   )r0   rC   forwarded_statess      r4   rE   UMT5LayerFF.forward   s;    ??=9../?@%5E(FFr6   )r{   r]   r}   ri   rM   s   @r4   rv   rv      s    7z 7 r6   rv   c                   T  ^  \ rS rSrSrSS\\   4U 4S jjjrS\R                  S\R                  4S jr
S rSS	 jr     SS
\R                  S\\R                     S\\\R                        S\\R                     S\\R                     S\\R                     4S jjrSrU =r$ )UMT5Attention   z/
T5's attention using relative_attention_bias.
	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        [7        5       U l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrT   )r)   r*   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerW   d_kvkey_value_proj_dim	num_headsn_headsr\   r]   	inner_dimr   loggerwarning_oncer3   rG   r   rV   qkvo	Embeddingrelative_attention_biassetpruned_heads)r0   rQ   r   r   r3   s       r4   r*   UMT5Attention.__init__   se    +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(Er6   
projectionreturnc                     UR                  5       S S U R                  U R                  4-   nUR                  U5      R	                  SSSS5      nU$ )Nr9   r   r8   r!   r   )sizer   r   viewpermute)r0   r   new_projection_shapenew_projections       r4   _shapeUMT5Attention._shape   sQ    )0"5tG^G^8__#)=>FFq!QPQRr6   c                    SnU R                   nU R                  nU R                  (       dC  US-  nX!S:  R                  [        R
                  5      U-  -  n[        R                  " U5      nO,[        R                  " U[        R                  " U5      5      * nUS-  nX:  n[        R                  " UR                  5       U-  5      [        R                  " XE-  5      -  nXsU-
  -  nXWR                  [        R
                  5      -   n[        R                  " U[        R                  " XS-
  5      5      nU[        R                  " XaU5      -  nU$ )aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   r8   r!   )r   r   r   r;   r,   longabsmin
zeros_likelogfloatmath	full_likewhere)	r0   relative_positionrelative_bucketsnum_bucketsmax_distance	max_exactis_small	log_ratiorelative_position_if_larges	            r4   _relative_position_bucket'UMT5Attention._relative_position_bucket   s/   * 99;;AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 II/557)CDtxxP\PhGii	y!89	%.ejj1I%I"%*YY&8RbcTc(d&
" 	EKKE_``r6   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nO	USS2S4   n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  U5      nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )z%Compute binned relative position biasN)r@   device)r8   r   r!   r   )	r   r.   r   r,   aranger   r   r   	unsqueeze)
r0   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r4   compute_biasUMT5Attention.compute_bias   s    >1188??F!$||L

SYZ[\^b[bc-ag6,,zFSTXZ[T[\+>#'#A#ABS#T --.FG	*44Q7r6   rC   encoder_hidden_statespast_key_valueattention_masklayer_head_maskr   c                    UR                   S S u  pxUS Ln	U R                  U5      n
U
R                  USU R                  U R                  5      R                  SS5      n
UbE  UR                  R                  U R                  5      nU	(       a  UR                  nOUR                  nU	(       a  UOUnU	(       a=  Ub:  W(       a3  WR                  U R                     nUR                  U R                     nOU R                  U5      nU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbL  U	(       d  UOS nWR                  XU R                  SU05      u  pU	(       a  SUR                  U R                  '   [         R"                  " XR                  SS5      5      nUb  XR%                  5       -   OUnUR                   S   nU R&                  (       d9  [         R(                  " SU R                  UU4UR*                  UR,                  S9nO.U R/                  UUUR*                  US	9nUS S 2S S 2U* S 2S S 24   nUb#  US S 2S S 2S S 2S UR                   S   24   nUU-   nU R0                  (       aS  [         R2                  " UR                   S   5      nS
U[5        U R0                  5      '   US S 2UR7                  5       4   nOUnUU-  n[8        R:                  R=                  UR?                  5       SS9RA                  U5      n[8        R:                  RC                  UU RB                  U RD                  S9nUb  UU-  n[         R"                  " UU5      nUR                  SS5      RG                  5       nUR                  XxS5      nU RI                  U5      nUUU4$ )Nr8   r9   r!   r   Tr   )r   r@   )r   r   r   dim)ptraining)%shaper   r   r   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updater,   matmulget_seq_lengthr   zerosr   r@   r   r   r-   listboolr   
functionalsoftmaxr   type_asr]   r   
contiguousr   )r0   rC   r   r   r   r   r   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresreal_seq_lengthr   position_biascausal_maskmaskposition_bias_maskedattn_weightsattn_outputs                            r4   rE   UMT5Attention.forward  s    "/!4!4Ra!8
 3$>vvm,#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#2D.-."<,66t~~FJ.::4>>JL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
 &@DN--dnn= l,@,@A,FG KYJd*'D'D'FFjt%%b)
//!KKDLL*j9&--W]WcWcM !--FMMR` . M *!Qa*?@M%(Aq2HJ4D4DR4H2H)HIK)K7M::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&zrBff[)L.88r6   )rW   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )FN)NNNNNNN)rG   rH   rI   rJ   __doc__r   intr*   r,   re   r   r   r   r   rE   rK   rL   rM   s   @r4   r   r      s    "XVY] " ": %,, - ^$ 9=8<152615Y9||Y9  (5Y9 !u||!45	Y9
 !.Y9 "%,,/Y9 !.Y9 Y9r6   r   c                   L   ^  \ rS rSrSS\\   4U 4S jjjr    SS jrSrU =r	$ )UMT5LayerSelfAttentioni_  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NTr   r   ry   )r)   r*   r   SelfAttentionr&   rW   r|   r}   r   r[   r\   r]   r0   rQ   r   r3   s      r4   r*   UMT5LayerSelfAttention.__init__`  sN    *6t_hi'F<U<UVzz&"5"56r6   c                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   nU$ )Nr   r   r   r   r   r!   )r}   r   r]   )	r0   rC   r   r   r   r   normed_hidden_statesattention_outputoutputss	            r4   rE   UMT5LayerSelfAttention.forwardf  sk      $}=-- )+)) . 
 &5Ea5H(II "%5ab%99r6   )r   r]   r}   rc   )NNNN
rG   rH   rI   rJ   r   r   r*   rE   rK   rL   rM   s   @r4   r   r   _  s0    7(3- 7 7  r6   r   c                   N   ^  \ rS rSrSS\\   4U 4S jjjr     SS jrSrU =r	$ )UMT5LayerCrossAttentioni{  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr   ry   )r)   r*   r   EncDecAttentionr&   rW   r|   r}   r   r[   r\   r]   r   s      r4   r*    UMT5LayerCrossAttention.__init__|  sO    ,VQVbkl'F<U<UVzz&"5"56r6   c           	          U R                  U5      nU R                  UUUUUUS9nXR                  US   5      -   n	U	4USS  -   n
U
$ )Nr   r   r   r   r   r   r!   )r}   r  r]   )r0   rC   r   r   r   r   r   r   r   layer_outputr  s              r4   rE   UMT5LayerCrossAttention.forward  sm      $}=// "7)+)) 0 
 %||4DQ4G'HH/$4QR$88r6   )r  r]   r}   rc   r   r  rM   s   @r4   r  r  {  s3    7(3- 7 7 # r6   r  c                   V   ^  \ rS rSrSS\\   4U 4S jjjr         SS jrSrU =r	$ )	UMT5Blocki  r   c                 j  > [         TU ]  5         UR                  U l        [        R                  " 5       U l        U R
                  R                  [        XS95        U R                  (       a"  U R
                  R                  [        XS95        U R
                  R                  [        U5      5        g )Nr   )
r)   r*   r   r   
ModuleListlayerappendr   r  rv   r   s      r4   r*   UMT5Block.__init__  sv     ++]]_


0MN??JJ5fRS

+f-.r6   c           	         U R                   S   " UUUUU
S9u  pnUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nS nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU
S9u  pnUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nU R                   S   " U5      nUR                  [        R                  :X  a}  [        R                  " UR                  5      R
                  n[        R                  " [        R                  " U5      R                  5       US-
  U5      n[        R                  " X* US9nUU4nU	(       a  UX4-  nU$ )Nr   r   i  )r   maxr!   r
  r9   )r  r@   r,   rA   finfor  r   isinfanyclampr   )r0   rC   r   r   encoder_attention_maskr   cross_attn_layer_head_maskr   	use_cacheoutput_attentionsr   self_attn_weights	max_dtypeclamp_valuecross_attn_weightsdo_cross_attentionr  s                    r4   rE   UMT5Block.forward  s    <@::a=)+))<
8. %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM "!__R1Fd1R@D

1&;5 :--A=M~ ""emm3!KK(;(;<@@	#kk%++m*D*H*H*JIX\L\^gh %M|Q\ ] 

2}5 %--/M$7$78<<I++ekk-&@&D&D&F	TXHXZcdK!KK<[YM 

 )>>Gr6   )r   r  rc   )	NNNNNNFFNr  rM   s   @r4   r  r    s?    /(3- / / "##'> >r6   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	UMT5ClassificationHeadi  z-Head for sentence-level classification tasks.rQ   c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  S9U l        [        R                  " UR                  UR                  5      U l
        g )N)r   )r)   r*   r   rV   rW   denser[   classifier_dropoutr]   
num_labelsout_projr`   s     r4   r*   UMT5ClassificationHead.__init__  sZ    YYv~~v~~>
zzF$=$=>		&..&2C2CDr6   rC   r   c                     U R                  U5      nU R                  U5      n[        R                  " U5      nU R                  U5      nU R	                  U5      nU$ rc   )r]   r(  r,   tanhr+  rg   s     r4   rE   UMT5ClassificationHead.forward  sN    ]3

=1

=1]3m4r6   )r(  r]   r+  )rG   rH   rI   rJ   r   r"   r*   r,   re   rE   rK   rL   rM   s   @r4   r&  r&    s4    7Ez EU\\ ell  r6   r&  c                   P    \ rS rSr\rSrSrSrSr	S/r
S/r\S 5       rS rS rS	rg
)UMT5PreTrainedModeli  transformerTr  rZ   c                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r,   tensorr   r   )r0   r5  
input_maskdummy_inputss       r4   r9   UMT5PreTrainedModel.dummy_inputs  s6    LL.	\\*-
!*"&0

 r6   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g	[        U[        [        [        [        45      (       Ga  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       aL  U R                   R                  (       d1  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       av  UR                   R                  R
                  R                  SX R                   R"                  S-  -  S9  UR                   R$                  R
                  R'                  5         g	g	[        U[(        5      (       ar  [        US5      (       a`  UR*                  R                  R
                  R                  SUS-  S9  UR*                  R$                  R
                  R'                  5         g	g	[        U[,        5      (       GaQ  UR.                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aE  UR.                  R$                  b.  UR.                  R$                  R
                  R'                  5         UR0                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aG  UR0                  R$                  b/  UR0                  R$                  R
                  R'                  5         g	g	g	[        U[2        5      (       GaQ  UR4                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR4                  S5      (       aE  UR4                  R$                  b.  UR4                  R$                  R
                  R'                  5         UR6                  R                  R
                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aG  UR6                  R$                  b/  UR6                  R$                  R
                  R'                  5         g	g	g	[        U[:        5      (       Ga  UR<                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR<                  S5      (       aE  UR<                  R$                  b.  UR<                  R$                  R
                  R'                  5         UR>                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR>                  S5      (       aE  UR>                  R$                  b.  UR>                  R$                  R
                  R'                  5         UR6                  R                  R
                  R                  SX R                   R8                  S-  -  S9  [        UR6                  S5      (       aG  UR6                  R$                  b/  UR6                  R$                  R
                  R'                  5         g	g	g	[        U[@        5      (       GaZ  U R                   R"                  nU R                   RB                  nU R                   RD                  nURF                  R                  R
                  R                  SX#U-  S-  -  S9  URH                  R                  R
                  R                  SX#S-  -  S9  URJ                  R                  R
                  R                  SX#S-  -  S9  URL                  R                  R
                  R                  SX%U-  S-  -  S9  URN                  (       a4  URP                  R                  R
                  R                  SX#S-  -  S9  g	g	g	)
zInitialize the weights      ?        )r>   stdlm_head
qa_outputs      
classifierrU   N))rQ   initializer_factorrd   r&   r.   datafill_	UMT5ModelUMT5ForConditionalGenerationUMT5EncoderModelUMT5ForQuestionAnsweringsharednormal_hasattrtie_word_embeddingsr?  r@  rW   rU   zero_UMT5ForTokenClassificationrB  r&  r(  r+  rO   rY   rZ   rX   rk   rn   ro   r   r   r   r   r   r   r   r   r   )r0   modulefactorrW   r   r   s         r4   _init_weights!UMT5PreTrainedModel._init_weights  s   //fm,,MM$$Vc\2, (	
 
 MM  %%--3FSL-Ivy))$++2Q2Q%%**22#2Nv|,,!!((--553F{{ObObgkNkDl5m!!&&++113 -  :;;v|,,!!((--553FSL5Q!!&&++113 -  677LL$$,,#6kkFYFY^bEb;c,dv||V,,1B1B1N!!&&,,.OO""''//SfI\I\aeHe>f/gv//FOO4H4H4T$$))//1 5U/ 122 II!!))s;;CVCV[_B_8`)avyy&))fiinn.H		##))+II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I) 677KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I).. kk))G!%!1!1kk++GHHOO  ((cvL^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cvL^B^cgAg7h(i11..55::BBQWhl[lQmBn 2 /r6   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In UMT5 it is usually set to the pad_token_id. See UMT5 docs for more information.r9   )r!   .r   r!   ).r   z1self.model.config.pad_token_id has to be defined.)rQ   decoder_start_token_idpad_token_id
ValueErrorr   r,   fullr   cat	new_zerosclonemasked_fill_)r0   r5  rV  rW  shifted_input_idss        r4   _shift_right UMT5PreTrainedModel._shift_rightO  s    !%!C!C{{//!)6  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r6    N)rG   rH   rI   rJ   r"   config_classbase_model_prefixsupports_gradient_checkpointing_supports_cache_class_supports_static_cache_no_split_modules_keep_in_fp32_modulespropertyr9  rR  r_  rK   ra  r6   r4   r1  r1    sN    L%&*# !$!F @oD!r6   r1  c                   .  ^  \ rS rSrSU 4S jjrS rS r             SS jr SS\\	R                  S4   S\	R                  S	\	R                  S
\S\4
S jjr\S\	R                  S\S\S\	R                   S	\	R                  S\4S j5       rSrU =r$ )	UMT5Stackik  c           
        > [         TU ]  U5        X l        UR                  U l        [        R
                  " [        UR                  5       Vs/ s H  n[        XS9PM     sn5      U l	        [        UR                  UR                  S9U l        [        R                  " UR                  5      U l        SU l        U R%                  5         g s  snf )Nr  ry   F)r)   r*   embed_tokensr   r   r  range
num_layersr  blockr&   rW   r|   final_layer_normr[   r\   r]   gradient_checkpointing	post_init)r0   rQ   rm  ir3   s       r4   r*   UMT5Stack.__init__l  s     ( ++]]ERXRcRcLd#eLdqIf$BLd#ef
 -fnn&B[B[ \zz&"5"56 ',# $fs   Cc                     U R                   $ rc   rm  r0   s    r4   get_input_embeddingsUMT5Stack.get_input_embeddingsx  s       r6   c                     Xl         g rc   rw  r0   new_embeddingss     r4   set_input_embeddingsUMT5Stack.set_input_embeddings{  s    *r6   c                 v   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U	(       a  [        R                  S
5        Sn	Uc)  U R                  c  [        S5      eU R                  U5      nUu  nnU	SL a   U R
                  (       d  [        SU  S35      eSnSnU R
                  (       a  U	(       d  Ub  [        U[        5      (       a,  [        U[         5      (       d  Sn[!        U[#        5       5      nOv[        U[         5      (       d.  Sn[        R                  S5        [         R$                  " U5      nO3Uc  [!        [#        5       [#        5       5      nOU R
                  (       d  S nUb  UR'                  5       OSnUc#  [(        R*                  " UUU-   UR,                  S9nUc4  [/        5       (       d%  UU-   n[(        R0                  " UUUR,                  S9nU R
                  (       a%  U R3                  UUUUb  UR4                  OS U
5      nO\UbW  US S 2S S S S 24   nUR7                  UR8                  S9nSU-
  [(        R:                  " UR8                  5      R<                  -  nOS nU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [(        R0                  " UUR,                  S9nU R?                  U5      nOS nU RA                  X`R                   RB                  5      nU RA                  XpR                   RB                  5      nU(       a  SOS nU
(       a  SOS nU
(       a  U R
                  (       a  SOS nU RE                  U5      n[G        U RH                  5       H  u  n n!UU    n"UU    n#U(       a  UU4-   nU R                  (       a7  U R                  (       a&  U RK                  U!RL                  UUUUU"U#S U	U
U5      n$OU!" UUUUU"U#UU	U
US9
n$U$S   nU	(       a  U$S   n%U
(       d  M  UU$S   4-  nU R
                  (       d  M  UU$S   4-  nM     U RO                  U5      nU RE                  U5      nU(       a  UU4-   nU	(       a  W%OS n&U(       a  UR4                  n&U(       a  URQ                  5       n&U(       d  [S        S UU&UUU4 5       5      $ [U        UU&UUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer9   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r@   r<  ra  )	r   r   r  r   r  r   r  r  r   r!   r8   r   c              3   0   #    U  H  nUc  M  Uv   M     g 7frc   ra  ).0r   s     r4   	<genexpr>$UMT5Stack.forward.<locals>.<genexpr>4  s"      
A  s   	)last_hidden_statepast_key_valuesrC   
attentionscross_attentions)+rQ   r  r  output_hidden_statesuse_return_dictr   rX  r   r   rr  r   r   r   rm  rd   r   r   r   from_legacy_cacher   r,   r   r   r   r-   _update_causal_maskr   r;   r@   r  r   invert_attention_maskget_head_maskro  r]   	enumeraterp  _gradient_checkpointing_funcrE   rq  to_legacy_cachetupler   )'r0   r5  r   r   r  r  	head_maskcross_attn_head_maskr  r  r  r  return_dictr   err_msg_prefixinput_shaper   r   return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrC   rt  layer_moduler   r  layer_outputsnext_decoder_cache
next_caches'                                          r4   rE   UMT5Stack.forward~  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii $&+#??	_-H/511*_Vi:j:j.2+"5o|~"V1DEE&*###`
 #6"G"G"X ("5lnln"U #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D*D4zAO"ZZ
OML`L`aN??228G8S44Y]!K '(D$)9:K%..}/B/B.CK,M<O<O0P0T0TTKK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d%64??rPT]3(4OA|'lO)=a)@&#$58H$H!**t}} $ A A ((!)3#.%"! !-!#.*?+J$3/I#2'&7#1! !.a 0%21%5"  =#3"55???(]1-=,??(W  5Z --m<]3   1]4D D+4'$
&(==J(88:J 
 "%"(
 
 
 9+&+%1
 	
r6   r   r#   input_tensorr   r  r  c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r=  flex_attentionr   Fsdpa)r  r  is_trainingr!   r9   )sequence_lengthtarget_lengthr@   r   r   )cudaxpunpu)rQ   _attn_implementationr  rd   r,   re   r$   r   is_compileabler   _ignore_causal_mask_sdpar   r@   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer  r   _unmask_unattended)r0   r   r  r   r  r  past_seen_tokensusing_compilable_cacher@   r  r  r   	min_dtypes                r4   r  UMT5Stack._update_causal_maskH  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr6   r  r  r@   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
N   )
fill_valuer@   r   r!   )diagonalr  r9   r   )r   r,   r  r   rY  r   triur   reshapeexpandr\  r   r;   masked_fill)r   r  r  r@   r   r   kwargsr   r  mask_lengthpadding_masks              r4   r  ?UMT5Stack._prepare_4d_causal_attention_mask_with_cache_position  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r6   )rp  r]   rm  rq  rr  r   rc   )NNNNNNNNNNNNN)F)rG   rH   rI   rJ   r*   ry  r~  rE   r   r,   re   r   r   r  staticmethodr   r@   r  rK   rL   rM   s   @r4   rk  rk  k  s    
!+
 "#!!G
` #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r6   rk  c            &       h  ^  \ rS rSrSrSr\rSS/rU 4S jr	S r
S rS	 rS
 rS rS r\                S S\\R&                     S\\R(                     S\\R&                     S\\R*                     S\\R(                     S\\R(                     S\\R,                     S\\\\R(                           S\\\\R(                           S\\R,                     S\\R,                     S\\   S\\   S\\   S\\   S\\R&                     S\\\R(                     \4   4"S jj5       rSrU =r$ )!rF  i  a?  
Examples:

```python
>>> from transformers import UMT5Model, AutoTokenizer

>>> model = UMT5Model.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> noisy_text = "UN Offizier sagt, dass weiter <extra_id_0> werden muss in Syrien."
>>> label = "<extra_id_0> verhandelt"
>>> inputs = tokenizer(inputs, return_tensors="pt")
>>> labels = tokenizer(label=label, return_tensors="pt")

>>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
>>> hidden_states = outputs.last_hidden_state
```umt5encoder.embed_tokens.weightdecoder.embed_tokens.weightc                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl	        SUl        UR                  Ul        [        X0R                  5      U l        U R#                  5         g NFT)r)   r*   r   r   
vocab_sizerW   rJ  copydeepcopyr   r  is_encoder_decoderrk  encodernum_decoder_layersro  decoderrs  r0   rQ   encoder_configdecoder_configr3   s       r4   r*   UMT5Model.__init__  s     ll6#4#4fnnEv.$)!#( ,1) =v.$(!,1)$*$=$=! = 	r6   c                     U R                   $ rc   rJ  rx  s    r4   ry  UMT5Model.get_input_embeddings      {{r6   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g rc   rJ  r  r~  r  r|  s     r4   r~  UMT5Model.set_input_embeddings  +    $)).9)).9r6   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g rc   rQ   rM  _tie_or_clone_weightsr  rm  rJ  r  rx  s    r4   _tie_weightsUMT5Model._tie_weights  P    ;;**&&t||'@'@$++N&&t||'@'@$++N +r6   c                     U R                   $ rc   r  rx  s    r4   get_encoderUMT5Model.get_encoder       ||r6   c                     U R                   $ rc   r  rx  s    r4   get_decoderUMT5Model.get_decoder  r  r6   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  r  	attentionprune_headsr0   heads_to_pruner  headss       r4   _prune_headsUMT5Model._prune_heads  s<    
 +002LELLu%//;;EB 3r6   r5  r   r4  r6  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr  r  r  r  r   r   c                 \   Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUU
UUUUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUU	UUUUUUUUUS9nU(       d  UU-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, UMT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5Model.from_pretrained("google/umt5-small")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for UMT5Model.
>>> # This is not needed for torch's UMT5ForConditionalGeneration as it does this internally using labels arg.
>>> decoder_input_ids = model._shift_right(decoder_input_ids)

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```Nr5  r   r  r  r  r  r  r   r!   r8   r  rC   r  r5  r   r  r  r   r  r  r  r  r  r  r  r   )r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentions)rQ   r  r  r  rd   r   lenr  r   r  r  rC   r  r  )r0   r5  r   r4  r6  r  r  r  r  r  r  r  r  r  r  r  r   rC   decoder_outputss                      r4   rE   UMT5Model.forward  s^   b "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1'!5/!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r6   )r  r  rJ  NNNNNNNNNNNNNNNN)rG   rH   rI   rJ   r   
model_typer"   rb  _tied_weights_keysr*   ry  r~  r  r  r  r   r   r   r,   
LongTensorFloatTensor
BoolTensorre   r   r   r   r   rE   rK   rL   rM   s   @r4   rF  rF    s   " JL79VW(:OC  156:8<=A159=7;EIEI048<$(,0/3&*59#D
E,,-D
 !!2!23D
 $E$4$45	D

 !))9)9 :D
 E--.D
 $E$5$56D
 'u||4D
 "%e.?.?(@"ABD
 "%e.?.?(@"ABD
  -D
  (5D
 D>D
 $D>D
 'tnD
  d^!D
" !!1!12#D
$ 
uU&&');;	<%D
 D
r6   rF  z<
    UMT5 Model with a `language modeling` head on top.
    )custom_introc            (         ^  \ rS rSrSrSr/ SQrU 4S jrS rS r	S r
S	 rS
 rS rS r\                 S#S\\R$                     S\\R&                     S\\R$                     S\\R(                     S\\R&                     S\\R&                     S\\R*                     S\\\\R*                           S\\\\R*                           S\\R&                     S\\R&                     S\\R$                     S\\   S\\   S\\   S\\   S\\R$                     S\\\R&                     \4   4$S jj5       rS\R*                  4S  jr\S! 5       rS"rU =r$ )$rG  i  a  
Examples:

```python
>>> from transformers import UMT5ForConditionalGeneration, AutoTokenizer

>>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> summary = "Weiter Verhandlung in Syrien."
>>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

>>> outputs = model(**inputs)
>>> loss = outputs.loss
```r  )r  r  zlm_head.weightc                 L  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        SUl        [        X R                  5      U l        [        R                  " U5      nSUl
        SUl        UR                  Ul        [        X0R                  5      U l        [        R$                  " UR                  UR                  SS9U l        U R)                  5         g )NFTrT   )r)   r*   rW   	model_dimr   r   r  rJ  r  r  r   r  r  rk  r  r  ro  r  rV   r?  rs  r  s       r4   r*   %UMT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#( ,1) =v.$(!,1)$*$=$=! =yy1B1BO 	r6   c                     U R                   $ rc   r  rx  s    r4   ry  1UMT5ForConditionalGeneration.get_input_embeddings  r  r6   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g rc   r  r|  s     r4   r~  1UMT5ForConditionalGeneration.set_input_embeddings  r  r6   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g rc   r  rx  s    r4   r  )UMT5ForConditionalGeneration._tie_weights  r  r6   c                     Xl         g rc   r?  r|  s     r4   set_output_embeddings2UMT5ForConditionalGeneration.set_output_embeddings  s    %r6   c                     U R                   $ rc   r"  rx  s    r4   get_output_embeddings2UMT5ForConditionalGeneration.get_output_embeddings  r  r6   c                     U R                   $ rc   r  rx  s    r4   r  (UMT5ForConditionalGeneration.get_encoder  r  r6   c                     U R                   $ rc   r  rx  s    r4   r  (UMT5ForConditionalGeneration.get_decoder  r  r6   r5  r   r4  r6  r  r  r  r  r  r  r  labelsr  r  r  r  r   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUU
UUUUS9nORU(       aK  [	        U[
        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nUb  Uc  Uc  U R                  U5      nU R                  UUUU	UUUUUUUUUS9nUS   nU R                   R                  (       a  UU R                  S-  -  nU R                  U5      nSnUb[  [        S	S
9nUR                  UR                  5      nU" UR                  SUR!                  S5      5      UR                  S5      5      nU(       d  U4USS -   U-   nUb  U4U-   $ U$ [#        UUUR$                  UR&                  UR(                  UR*                  UR,                  UR&                  UR(                  S9	$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, UMT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5ForConditionalGeneration.from_pretrained("google/umt5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer("Studies have shown that <extra_id_0> good for you", return_tensors="pt").input_ids
>>> outputs = model.generate(input_ids)
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
```Nr  r   r!   r8   r  r  rA  rU  ignore_indexr9   	losslogitsr  r	  r
  r  r  r   r  )rQ   r  r  r  rd   r   r  r_  r  rM  r  r?  r	   r;   r   r   r   r   r  rC   r  r  r  )r0   r5  r   r4  r6  r  r  r  r  r  r  r  r,  r  r  r  r  r   rC   r  sequence_output	lm_logitsr1  loss_fctoutputs                            r4   rE   $UMT5ForConditionalGeneration.forward  s.   j "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1'!5/!5#) ' 
  *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r6   c                 $    U R                  U5      $ rc   )r_  )r0   r,  s     r4   %prepare_decoder_input_ids_from_labelsBUMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r6   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nra  c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7f)r   N)index_selectr;   r   )r  
past_statebeam_idxs     r4   r  >UMT5ForConditionalGeneration._reorder_cache.<locals>.<genexpr>  s1     ncmU_--aZ=N=N1OPPcms   7:)r  )r  r?  reordered_past
layer_pasts    `  r4   _reorder_cache+UMT5ForConditionalGeneration._reorder_cache  s8    )Jncmnn N * r6   )r  r  r?  r  rJ  )NNNNNNNNNNNNNNNNN) rG   rH   rI   rJ   r   r  r  r*   ry  r~  r  r#  r&  r  r  r   r   r,   r  r  r  re   r   r   r   r   rE   r9  r  rC  rK   rL   rM   s   @r4   rG  rG    s4     Ji0:O&  156:8<=A159=7;@D@D59=A-1$(,0/3&*59%_
E,,-_
 !!2!23_
 $E$4$45	_

 !))9)9 :_
 E--._
 $E$5$56_
 'u||4_
 "%ell(;"<=_
 "%ell(;"<=_
   1 12_
  ((9(9:_
 ))*_
 D>_
 $D>_
  'tn!_
" d^#_
$ !!1!12%_
& 
uU&&'8	9'_
 _
D)ELL )  r6   rG  c                   8  ^  \ rS rSrSrSrS/rU 4S jrS rS r	S r
S	 rS
 r\       SS\\R                      S\\R"                     S\\R"                     S\\R"                     S\\   S\\   S\\   S\\\R"                     \4   4S jj5       rSrU =r$ )rH  i  a  
Examples:

```python
>>> from transformers import UMT5EncoderModel, AutoTokenizer

>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
>>> input_ids = tokenizer(article, return_tensors="pt").input_ids
>>> outputs = model(input_ids)
>>> hidden_state = outputs.last_hidden_state
```r  r  c                    > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        X R                  5      U l        U R                  5         g NF)r)   r*   r   r   r  rW   rJ  r  r  r  r  rk  r  rs  )r0   rQ   r  r3   s      r4   r*   UMT5EncoderModel.__init__  sf     ll6#4#4fnnEv.#( ,1) = 	r6   c                     U R                   $ rc   r  rx  s    r4   ry  %UMT5EncoderModel.get_input_embeddings  r  r6   c                 F    Xl         U R                  R                  U5        g rc   )rJ  r  r~  r|  s     r4   r~  %UMT5EncoderModel.set_input_embeddings  s    $)).9r6   c                     U R                   R                  (       a1  U R                  U R                  R                  U R
                  5        g g rc   )rQ   rM  r  r  rm  rJ  rx  s    r4   r  UMT5EncoderModel._tie_weights  s2    ;;**&&t||'@'@$++N +r6   c                     U R                   $ rc   r  rx  s    r4   r  UMT5EncoderModel.get_encoder  r  r6   c                     UR                  5        HD  u  p#U R                  R                  U   R                  S   R                  R                  U5        MF     g)r  r   N)r  r  rp  r  r   r  r  s       r4   r   UMT5EncoderModel._prune_heads  sG    
 +002LELLu%++A.<<HHO 3r6   r5  r   r  r  r  r  r  r   c           
      f    Ub  UOU R                   R                  nU R                  UUUUUUUS9nU$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, UMT5EncoderModel

>>> tokenizer = AutoTokenizer.from_pretrained("google/umt5-small")
>>> model = UMT5EncoderModel.from_pretrained("google/umt5-small")
>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```r  )rQ   r  r  )	r0   r5  r   r  r  r  r  r  r  s	            r4   rE   UMT5EncoderModel.forward  sK    F &1%<k$++B]B],,)'/!5# ' 
 r6   )r  rJ  )NNNNNNN)rG   rH   rI   rJ   r   r  r  r*   ry  r~  r  r  r   r   r   r,   r  r  r   r   r   r   rE   rK   rL   rM   s   @r4   rH  rH    s     J78
:
O
P  156:1559,0/3&*-E,,-- !!2!23- E--.	-
   1 12- $D>- 'tn- d^- 
uU&&'8	9- -r6   rH  z
    UMT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c            $         ^  \ rS rSrS/rSS/rS\4U 4S jjr\               SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\\\4   4 S jj5       rSrU =r$ )UMT5ForSequenceClassificationi  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightr  r  rQ   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         SU l        g rG  )r)   r*   rF  r2  r&  classification_headrs  model_parallelr`   s     r4   r*   &UMT5ForSequenceClassification.__init__  s>     $V,#9&#A  	#r6   r5  r   r4  r6  r  r  r  r  r  r  r,  r  r  r  r  r   c                 4   Ub  UOU R                   R                  nUb  SnUc%  U	b"  [        SU R                  R                   35      eUc"  U
c  Uc  [        S5      eU R                  U5      nU R                  UUUUUUUUU	U
UUUUS9nUS   nUR                  U R                   R                  5      R                  UR                  5      n[        [        R                  " UR                  S5      5      5      S:  a  [        S5      eUR                   u  nnnUUSS24   R#                  US	U5      SS2S	SS24   nU R%                  U5      nSnUGb  UR                  UR                  5      nU R                   R&                  c  U R                   R(                  S:X  a  S
U R                   l        OyU R                   R(                  S:  aN  UR*                  [        R,                  :X  d  UR*                  [        R.                  :X  a  SU R                   l        OSU R                   l        U R                   R&                  S
:X  aT  [1        5       nU R                   R(                  S:X  a&  U" UR3                  5       UR3                  5       5      nOU" UU5      nOU R                   R&                  S:X  aG  [5        5       nU" UR#                  S	U R                   R(                  5      UR#                  S	5      5      nO-U R                   R&                  S:X  a  [7        5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [9        UUUR:                  UR<                  UR>                  UR@                  URB                  URD                  URF                  S9	$ )ak	  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r4  r6  r  r  r  r  r  r  r  r  r  r  r   r!   z7All examples must have the same number of <eos> tokens.r9   
regressionsingle_label_classificationmulti_label_classificationr0  )$rQ   r  NotImplementedErrorr3   rG   rX  r_  r2  eqeos_token_idr;   r   r  r,   unique_consecutivesumr   r   rY  problem_typer*  r@   r   r   r
   squeezer	   r   r   r  r	  r
  r  r  r   r  )r0   r5  r   r4  r6  r  r  r  r  r  r  r,  r  r  r  r  r  r3  eos_maskr   r  r1   sentence_representationr2  r1  r5  r6  s                              r4   rE   %UMT5ForSequenceClassification.forward  sR   | &1%<k$++B]B]I!:%J4>>KbKbJcd  $)>)F  U 
 !% 1 1) <"")/#9/!5+'"7/!5# # 
  "!*<< 8 89<<_=S=STu''Q89A=VWW%4%:%:"
A{"1(A+">"C"CJPRT_"`abdfhiai"j))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r6   )rY  rZ  r2  )NNNNNNNNNNNNNNN)rG   rH   rI   rJ   "_keys_to_ignore_on_load_unexpectedr  r"   r*   r   r   r,   r  re   r   r  r   r   r   r   rE   rK   rL   rM   s   @r4   rV  rV    s    +s)s&79VW$z $  15158<=A,0487;=A59=A-1$(,0/3&*!P
E,,-P
 !.P
 $E$4$45	P

 !))9)9 :P
 ELL)P
 $ELL1P
 'u||4P
 "$u'8'8"9:P
   1 12P
  ((9(9:P
 ))*P
 D>P
 $D>P
 'tnP
  d^!P
" 
u55	6#P
 P
r6   rV  c                   @  ^  \ rS rSrS/rS/rS\4U 4S jjr\        SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\   S\	\   S\	\   S\\\
R                     \4   4S jj5       rSrU =r$ )rO  i  rW  z'transformer.encoder.embed_tokens.weightrQ   c                 0  > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  5      U l        [
        R                  " UR                  UR                  5      U l        U R                  5         g rc   )r)   r*   r*  rH  r2  r   r[   r)  r]   rV   r1   rB  rs  r`   s     r4   r*   #UMT5ForTokenClassification.__init__  sj      +++F3zz&";";<))F$6$68I8IJ 	r6   r5  r   r  r  r,  r  r  r  r   c	           
         Ub  UOU R                   R                  nU R                  UUUUUUUS9n	U	S   n
U R                  U
5      n
U R	                  U
5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  XSS 4nUb  U4U-   $ U$ [        UUU	R                  U	R                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
N)r   r  r  r  r  r  r   r9   r8   )r1  r2  rC   r  )rQ   r  r2  r]   rB  r	   r   r*  r   rC   r  )r0   r5  r   r  r  r,  r  r  r  r  rC   r2  r1  r5  r6  s                  r4   rE   "UMT5ForTokenClassification.forward  s    6 &1%<k$++B]B]"")'/!5# # 
  
]3/')HFKKDOO<fkk"oNDam,F)-)9TGf$EvE$!//))	
 	
r6   )rB  r]   r*  r2  )NNNNNNNN)rG   rH   rI   rJ   rk  r  r"   r*   r   r   r,   re   r   r   r   r   rE   rK   rL   rM   s   @r4   rO  rO    s    *r)s&CD	z 	  -115,004)-,0/3&*7
ELL)7
 !.7
 ELL)	7

  -7
 &7
 $D>7
 'tn7
 d^7
 
uU\\"$99	:7
 7
r6   rO  c            &       J  ^  \ rS rSrSS/rU 4S jrS rS rS rS r	S	 r
\                SS
\\R                     S\\R                     S\\R                     S\\R                      S\\R                     S\\R                     S\\R"                     S\\\\R"                           S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\R                     \4   4"S jj5       rSrU =r$ )rI  i  r  r  c                 p  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        SUl        [        X R                  5      U l        [        R                  " U5      nSUl
        SUl        UR                  Ul        [        X0R                  5      U l        UR$                  U l        [        R&                  " UR                  UR$                  5      U l        U R+                  5         g r  )r)   r*   rW   r  r   r   r  rJ  r  r  r   r  r  rk  r  r  ro  r  r*  rV   r@  rs  r  s       r4   r*   !UMT5ForQuestionAnswering.__init__  s     ll6#4#4fnnEv.$)!#( ,1) =v.$(!,1)$*$=$=! = ++))FNNF4E4EF 	r6   c                     U R                   $ rc   r  rx  s    r4   ry  -UMT5ForQuestionAnswering.get_input_embeddings  r  r6   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g rc   r  r|  s     r4   r~  -UMT5ForQuestionAnswering.set_input_embeddings  r  r6   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g rc   r  rx  s    r4   r  %UMT5ForQuestionAnswering._tie_weights"  r  r6   c                     U R                   $ rc   r  rx  s    r4   r  $UMT5ForQuestionAnswering.get_encoder(  r  r6   c                     U R                   $ rc   r  rx  s    r4   r  $UMT5ForQuestionAnswering.get_decoder,  r  r6   r5  r   r4  r6  r  r  r  r  start_positionsend_positionsr  r  r  r  r  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U
b  SnUc"  Uc  Uc  [        S5      eU R	                  U5      nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUSUUUUUUUUS	9nUS   nU R                  U5      nUR                  SS
S9u  nnUR                  S
5      R                  5       nUR                  S
5      R                  5       nSnU	b  U
b  [        U	R                  5       5      S:  a*  U	R                  S
5      R                  UR                   5      n	[        U
R                  5       5      S:  a*  U
R                  S
5      R                  UR                   5      n
UR                  S5      nU	R#                  SU5      n	U
R#                  SU5      n
[%        US9nU" UU	5      nU" UU
5      nUU-   S-  nU(       d  UU4USS -   U-   nUb  U4U-   $ U$ ['        UUUUR(                  UR*                  UR,                  UR.                  UR0                  UR*                  UR,                  S9
$ )aY  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. UMT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [UMT5 Training](./umt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    UMT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [UMT5
    Training](./umt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
NFr]  r  r   r!   r8   r  )r5  r   r  r  r   r  r  r  r  r  r  r  r9   r   r.  )
r1  start_logits
end_logitsr  r	  r
  r  r  r   r  )rQ   r  r  rX  r_  r  rd   r   r  r  r@  splitrg  r   r   r;   r   r  r	   r   r  rC   r  r  r  )r0   r5  r   r4  r6  r  r  r  r  r~  r  r  r  r  r  r  r  rC   r  r3  r2  r  r  
total_lossignored_indexr5  
start_lossend_lossr6  s                                r4   rE    UMT5ForQuestionAnswering.forward/  s<   x &1%<k$++B]B]!*!6IDKK<Q<Q	&=+DI
 $)>)F  U 
 !% 1 1) <!*!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/ "/#1'!5/!5# ' 
 *!,1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J//!"2EEWF/9/EZMF*Q6Q2%!+;;"1"?"?.99,==&5&G&G"1"?"?.99
 	
r6   )r  r  r  r*  r@  rJ  r  )rG   rH   rI   rJ   r  r*   ry  r~  r  r  r  r   r   r,   r  r  r  re   r   r   r   r   rE   rK   rL   rM   s   @r4   rI  rI    s   79VW2:O  156:8<=A159=7;@D6:4859=A$(,0/3&*#Z
E,,-Z
 !!2!23Z
 $E$4$45	Z

 !))9)9 :Z
 E--.Z
 $E$5$56Z
 'u||4Z
 "%ell(;"<=Z
 "%"2"23Z
   0 01Z
   1 12Z
  ((9(9:Z
 D>Z
 $D>Z
  'tn!Z
" d^#Z
$ 
uU&&')LL	M%Z
 Z
r6   rI  )rH  rG  rI  rV  rO  rF  r1  )Fr   r  r   typingr   r   r   r   r,   r   torch.nnr   r	   r
   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   r   r    configuration_umt5r"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerrG   r   Moduler&   rO   rk   rv   r   r   r  r  r&  r1  rk  rF  rG  rH  rV  rO  rI  __all__ra  r6   r4   <module>r     s      / /   A A ! C C ) >   .   +  !!;J			H	%+BII +4		 .RYY <")) $9BII 9DRYY 8bii <I		 IZRYY $ o!/ o! o!dW# Wt
 O
# O
 O
d 
x#6 x
xv i* i iX `
$7 `
`
F I
!4 I
 I
X N
2 N
 N
br6   