
    fTh$                       S r SSKrSSKrSSKrSSKJrJrJrJrJ	r	  SSK
r
SSK
Jr  SSKJr  SSKJr  SSKJrJrJr  SS	KJr  SS
KJr  SSKJrJrJrJr  SSKJr  SSKJ r J!r!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  \'" 5       (       a  SSK-J.r.  SSK/J0r0  \*Rb                  " \25      r3SSS\
Rh                  S\5S\5S\5S\
Rh                  4
S jjr6S\
Rh                  S\5S\5S\
Rh                  4S jr7SSS\
Rh                  S\5S\5S\5S\
Rh                  4
S jjr8S\5S\
Rh                  4S jr9S\
Rh                  S\5S\
Rh                  4S jr:S \
Rh                  S\5S!\
Rv                  S\
Rh                  4S" jr<S \
Rh                  S#\5S\\
Rh                  \
Rh                  4   4S$ jr=S \
Rh                  S#\5S\
Rh                  4S% jr>S&\
Rh                  S'\
Rh                  S(\5S\
Rh                  4S) jr? " S* S+\R                  5      rA SS,KBJCrC  \CrA\3R                  S-5        \ R                  " \A5         " S/ S0\R                  5      rI " S1 S2\R                  5      rJ " S3 S4\R                  5      rK " S5 S6\R                  5      rL " S7 S8\R                  5      rM " S9 S:\R                  5      rN " S; S<\R                  5      rO " S= S>\R                  5      rP " S? S@\R                  5      rQ " SA SB\R                  5      rR " SC SD\R                  5      rS\& " SE SF\5      5       rT " SG SH\T5      rUSIrV\& " SJ SK\T5      5       rW\&" SLSM9 " SN SO\T\5      5       rX\& " SP SQ\T5      5       rY/ SRQrZg! \E a     GNY\F a    \3R                  S.5         GNrf = f)TzPyTorch LongT5 model.    N)AnyListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 l   U R                   U   * U-  n[        U R                   5      (       d?  [        U R                   5      nXR==   U-  ss'   [        R                  " XPR
                  S9$ S/U R                  -  nSU4Xb'   [        USSS2   S5      n[        R                  R                  XSUS9n U $ )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr+   ndimsumr   
functionalr1   )r$   r%   r&   r'   pad_len	new_shaper1   s          b/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler?   >   s    wws|mi'Gqww<<M	'!{{9GG44(QVV
C7|CH
c$B$i
C
!:YGAH    c                 4   U R                   U   U-  S:w  a  [        XUSS9n U R                   U   U-  nU R                   SU X14-   U R                   US-   S -   nSU;   a)  [        R                  " X@R                  U R
                  S9$ U R                  U5      $ )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
r   )r'   Nr    r+   device)r4   r?   r7   emptyr+   rC   reshape)r$   r%   r&   
num_blocksoutput_shapes        r>   _split_into_blocksrH   N   s    
 	wws|i1$Q3!<*J774C=J#::QWWcAg[=QQLL{{<wwqxxHH99\""r@   	block_dimsequence_dimc                    U R                   U   nS/U R                  -  nSXQ'   [        USSS2   S5      n[        R                  R                  XSUS9n / n[        S5       HK  n[        S	S5      /U R                  -  n[        XwU-   5      X'   [        U5      nUR                  X   5        MM     [        R                  " XbS
9$ )zConcatenate three consecutive blocks for each input block for local attentiont.

For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
r,   )r    r    Nr-   r.   r/   r0   r
   r   r&   )r4   r9   r:   r   r;   r1   rangeslicetupleappendr7   cat)	r$   rI   rJ   r'   rF   r1   blocks_listiindicess	            r>   _concatenate_3_blocksrU   ]   s    
 #J(QVV
CCN
c$B$i
C
!:YGA&(K1X D>"QVV+"1*n5.1:&  99[33r@   c                     [         R                  " SU -  [         R                  S9nXU *  nUR                  S5      UR                  S5      -
  nU$ )z:Makes 3-blocked relative position ids for local attention.r
   r*   r   r    )r7   arangeint32	unsqueeze)r%   position_idscenter_position_idsrelative_position_idss       r>   "_make_3block_relative_position_idsr]   v   sP    <<IU[[AL&)<(22158K8U8UVW8XX  r@   local_attention_maskc                     [        U5      n[        R                  " U5      U:  nUSSSS2SS24   nUR                  U R                  5      n[        R
                  " X5      $ )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)r]   r7   abstorC   logical_and)r^   r%   r\   locality_masks       r>   _mask_local_attention_maskrd      s]    >yIII34y@M!$a"23M!$$%9%@%@AM1AAr@   attention_maskrC   c                     [        XSS9n[        USSS9nUR                  S5      nUR                  S5      n[        R                  " X45      n[        XQ5      nUR                  S5      R                  U5      $ )z;Prepare attention mask to be applied for a local attention.r    rL      rI   rJ   r-   )rH   rU   rY   r7   rb   rd   ra   )re   r%   rC   _blocked_attention_mask_3blocked_attention_maskr^   s         r>   _get_local_attention_maskrl      s     1PQR45LXYhij5??C7AA"E ,,-D_56JV))!,//77r@   global_block_sizec                   ^^ U R                   SS u  nmS[        R                  S[        R                  4UU4S jjn[        R                  " X R                  S9T-  n[        R
                  " USS9U-
  n[        R                  " U S	:g  S
S5      R                  U R                  5      n[        R                  " XT-   S
-
  5      R                  U R                  5      n[        R                  " SUR                  UR                  S9n[        R                  " Xg:  Xg5      nX`-  U S-
  -   nU" U5      nTT-  nUS:  a@  [        R                  " USS9R                  R                  US5      R                  SS5      n	O+[        R                  " USUR                  UR                  S9n	[        R
                  " [        R                   " X(5      SS9S-
  n
U
R#                  U R                  5      n
[        R                  " X:*  SS5      n
UR                  [        R$                  5      U
R                  [        R$                  5      4$ )a  Obtain the "fixed block" global id corresponding to each input token.

This implementation is a simplified version of the original Flaxformr implementation adopted from:
https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
the whole fixed block, are assigned to the preceding block.

Padding tokens from the original sequence are represented by -1.
Nrg   	block_idsr(   c                 X  > [         R                  " T5      T-  TS-
  :H  nUR                  U R                  5      n[         R                  " XS:  5      nUR                  S5      R                  S5      R                  U R                  5      S-
  n[         R                  " X:  X5      n U $ )Nr    r   r-   )
r7   rW   ra   rC   rb   r:   rY   typer+   where)ro   
block_endstrue_block_endsfull_blocksrm   seq_lens       r>   handle_orphan_tokens:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    ll7+.??DUXYDYY
]]9#3#34
++JQG%))"-77;@@QTUUKK	 7P	r@   rC   r    )axis              ?g     @r-   rB   r   rL   )r4   r7   Tensor	ones_likerC   cumsumrr   rq   r+   floortensormaxvaluesrepeat	transposer8   onesra   int)re   rm   
batch_sizerw   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsrv   s    `         @r>   _make_global_fixed_block_idsr      s    )..r2J    ~>S>STWhh||$41=@PP;;~,c7;@@AUAUVD{{4#:S#@AFF~G[G[\$)LL;K;Q;QZjZqZq$r!{{8:J )9nq>PQ+,<=..KQ"')),<""E"L"L"S"ST_ab"c"m"mnoqr"s"'++!1!7!7@P@W@W#
 ejj&IrRUVV+..~/D/DE%7%RTUWXY  +-?-D-DUYY-OOOr@   c                     [        X5      u  p#UR                  S   n[        R                  " XBR                  S9nXRS   -
  nUR                  [        R                  5      $ )zBCreate the relative position tensor for local -> global attention.r-   ry   .N)r   r4   r7   rW   rC   rq   int64)re   rm   ro   r   global_seq_lenglobal_positionsside_relative_positions          r>    _make_side_relative_position_idsr      sW    $@$c!I'--b1N||N;K;KL-)0DD!&&u{{33r@   hidden_statesro   r   c           	      r   UR                  US:  [        R                  " X!R                  UR                  S95      n[
        R                  R                  UR                  [        R                  5      US-   5      SS2SS2SS24   n[        R                  " SXR                  U R                  5      5      $ )zFCompute individual block aggregates by summing over individual blocks.r   rB   r    Nr-   z...nd,...ng->...gd)rr   r7   r   r+   rC   r   r;   one_hotrq   r   einsum)r   ro   r   one_hot_block_idss       r>   _create_global_aggregatesr      s    
 Q^??S\ScScdI --innU[[.I>\]K]^_`bcehfheh_hi<<,m=S=STaTgTg=hiir@   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )LongT5LayerNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)zW
Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
N)super__init__r   	Parameterr7   r   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r>   r   LongT5LayerNorm.__init__   s/     	ll5::k#:; #r@   c                    UR                  [        R                  5      R                  S5      R	                  SSS9nU[        R
                  " X R                  -   5      -  nU R                  R                  [        R                  [        R                  4;   a%  UR                  U R                  R                  5      nU R                  U-  $ )Nrg   r-   T)keepdim)ra   r7   float32powmeanrsqrtr   r   r+   float16bfloat16)r   r   variances      r>   forwardLongT5LayerNorm.forward   s     !##EMM266q9>>r4>P%H?T?T4T(UU ;; ??),,T[[->->?M{{]**r@   )r   r   )gư>)__name__
__module____qualname____firstlineno__r   r   __static_attributes____classcell__r   s   @r>   r   r      s    $+ +r@   r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseActDensei	  configc                 X  > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l
        [        UR                     U l        g NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   s     r>   r   LongT5DenseActDense.__init__
  sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r@   c                    U R                  U5      nU R                  U5      nU R                  U5      n[        U R                  R
                  [        R                  5      (       a  UR                  U R                  R
                  R                  :w  aa  U R                  R
                  R                  [        R                  :w  a/  UR                  U R                  R
                  R                  5      nU R	                  U5      nU$ N)r   r   r   
isinstancer   r   r7   r}   r+   int8ra   )r   r   s     r>   r   LongT5DenseActDense.forward  s    ./]3tww~~u||44##tww~~';';;$$

2),,TWW^^-A-ABM.r@   )r   r   r   r   	r   r   r   r   r!   r   r   r   r   r   s   @r>   r   r   	  s    /| / r@   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5DenseGatedActDensei  r   c                   > [         TU ]  5         [        R                  " UR                  UR
                  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        R                  " UR
                  UR                  SS9U l        [        R                  " UR                  5      U l        [        UR                     U l        g r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   s     r>   r   !LongT5DenseGatedActDense.__init__   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r@   c                     U R                  U R                  U5      5      nU R                  U5      nX#-  nU R                  U5      nU R	                  U5      nU$ r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r>   r    LongT5DenseGatedActDense.forward(  sQ    hhtyy78		-0#3]3.r@   )r   r   r   r   r   r   r   s   @r>   r   r     s    /| / r@   r   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )LongT5LayerFFi2  r   c                   > [         TU ]  5         UR                  (       a  [        U5      U l        O[        U5      U l        [        UR                  UR                  S9U l	        [        R                  " UR                  5      U l        g )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r>   r   LongT5LayerFF.__init__3  s_    ":6"BD"5f"=D)&..f>W>WXzz&"5"56r@   c                 p    U R                  U5      nU R                  U5      nXR                  U5      -   nU$ r   )r   r   r   )r   r   forwarded_statess      r>   r   LongT5LayerFF.forward=  s;    ??=9../?@%5E(FFr@   )r   r   r   r   r   s   @r>   r   r   2  s    7| 7 r@   r   c                      ^  \ rS rSr  S
S\S\\   4U 4S jjjrS r\	SS j5       r
SS jr         SS jrS	rU =r$ )LongT5AttentioniE  r   	layer_idxc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  U R                  -  U l        X0l        Uc>  U R                  (       a-  [        R!                  SU R"                  R$                   S35        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        [&        R(                  " U R                  U R                  SS9U l        U R                  (       a0  [&        R2                  " U R                  U R                  5      U l        [7        5       U l        SU l        g )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   s       r>   r   LongT5Attention.__init__F  so    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r@   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g Nr   r    rL   lenr   r   r   r  r   r   r   r   r   r   unionr   headsindexs      r>   prune_headsLongT5Attention.prune_headsi      u:?7<<!8!8$:K:K
 $DFFE2#DFFE2#DFFE2#DFFEq9||c%j0004<<? --33E:r@   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ aR  
Adapted from Mesh Tensorflow:
https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

Translate relative position to a bucket number for relative attention. The relative position is defined as
memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
This should allow for more graceful generalization to longer sequences than the model has been trained on

Args:
    relative_position: an int32 Tensor
    bidirectional: a boolean - whether the attention is bidirectional
    num_buckets: an integer
    max_distance: an integer

Returns:
    a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
r   rg   r    ra   r7   longr`   min
zeros_likelogfloatmath	full_likerr   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r>   _relative_position_bucket)LongT5Attention._relative_position_buckety  s   , AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r@   c                    Uc   U R                   R                  R                  nUc,  [        R                  " U[        R
                  US9SS2S4   nOUSS2S4   R                  U5      n[        R                  " U[        R
                  US9SSS24   nXe-
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      n	U	R                  / SQ5      R                  S5      n	U	$ )%Compute binned relative position biasNrB   r  r   r!  rg   r   r    r   )r  r   rC   r7   rW   r  ra   r&  r   r   r   permuterY   )
r   query_length
key_lengthrC   cache_positioncontext_positionmemory_positionr  relative_position_bucketr   s
             r>   compute_biasLongT5Attention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r@   c                 ~   UR                   SS u  pUSLnU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUbE  UR                  R                  U R                  5      nU(       a  UR                  nOUR                  nU(       a  UOUnU(       a=  Ub:  W(       a3  WR                  U R                     nUR                  U R                     nOU R                  U5      nU R                  U5      nUR                  USU R                  U R                  5      R                  SS5      nUR                  USU R                  U R                  5      R                  SS5      nUbN  U(       d  U
OSn
WR                  UUU R                  SU
05      u  nnU(       a  SUR                  U R                  '   [         R"                  " UUR                  SS5      5      nUc  UR                   S   nUb  UOU
S   S-   nU R$                  (       db  [         R&                  " SU R                  UU4UR(                  UR*                  S	9nU R,                  (       a  U R.                  (       a  SUl        O.U R3                  UUUR(                  U
S
9nUSS2SS2U* S2SS24   nUb#  USS2SS2SS2SUR                   S   24   nUU-   nU R4                  (       aS  [         R6                  " UR                   S   5      nSU[9        U R4                  5      '   USS2UR;                  5       4   nOUnUU-  n[<        R>                  RA                  URC                  5       SS9RE                  U5      n[<        R>                  RG                  UU RF                  U R.                  S9nUb  UU-  n[         R"                  " UU5      nUR                  SS5      RI                  5       nUR                  USU RJ                  5      nU RM                  U5      nUXT4nU	(       a  UU4-   nU$ )zp
Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
Nrg   r-   r    r/  Tr
   ri   rC   r+   )rC   r/  r   rL   ptraining)'r4   r   viewr   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updater7   matmulr   r8   rC   r+   r  r9  requires_gradr3  r  r   r6   boolr   r;   softmaxr  type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biaspast_key_valuelayer_head_maskr-  	use_cacheoutput_attentionsr/  r   
seq_lengthis_cross_attentionquery_statesr;  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr.  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r>   r   LongT5Attention.forward  s   $ "/!4!4Ra!8
 .T9vvm,#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#-?)]."<,66t~~FJ.::4>>JL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fff[)>/Gr@   )r   r   r  r   r   r   r   r   r   r   r   r  r   r  r   r   r   FNT       )NN)	NNNNNNFFN)r   r   r   r   r!   r   r   r   r  staticmethodr&  r3  r   r   r   r   s   @r>   r   r   E  st     %*#'	!,!, C=	!, !,F;  -  - ^. i ir@   r   c                   v   ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jr    SS jrSrU =r$ )LongT5LocalAttentioni)  r   r   r(   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        U R                  U R                  -  U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        [         R"                  " U R                  U R                  SS9U l        U R                  (       a0  [         R,                  " U R                  U R                  5      U l        [1        5       U l        SU l        g )Nr    Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr%   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r   r   r   r   s      r>   r   LongT5LocalAttention.__init__*  sQ    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r@   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r	  r
  r  s      r>   r   LongT5LocalAttention.prune_headsD  r  r@   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r>   r&  .LongT5LocalAttention._relative_position_bucketT     . AKQ!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$0 &/II'--/);<hh|/01&( "UZZ.	&"
 &+YY&8RbcTc(d&
" 	EKKE_``r@   block_lengthc                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ r)  metaNr
   rB   r*  r+  r   r  r   rC   rq   r7   rW   r  r&  r   r   r   r,  rY   r   rm  target_devicer1  r0  r  r2  r   s           r>   r3  !LongT5LocalAttention.compute_bias       ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr@   c                 6  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jnU" T R                  U5      5      n	U" T R                  U5      5      n
U" T R                  U5      5      n[	        U	T R
                  SS9n	[	        U
T R
                  SS9n
[	        UT R
                  SS9n[        U
SSS9n
[        USSS9n[        R                  " SX5      nUc  T R                  (       dz  [        R                  " SST R                  T R
                  ST R
                  -  4UR                  UR                  S	9nT R                  (       a  T R                  (       a  S
Ul        OT R#                  T R
                  5      nUb/  [        R$                  " US:  SS5      nX2R'                  SS5      -   nX-  n[(        R*                  R-                  UR/                  5       SS9R1                  U5      n[(        R*                  R3                  UT R2                  T R                  S9nUb  X-  nUR5                  UR                  5      nU" [        R                  " SX5      5      nUS S 2S U2S S 24   nT R7                  U5      nS nU4U4-   U4-   nU(       a  UU4-   nU$ )Nrg   c                 T   > U R                  TSTR                  TR                  5      $ 
projectionr-   r:  r   r   statesr   r   s    r>   r4   +LongT5LocalAttention.forward.<locals>.shape  "    ;;z2t||T=T=TUUr@   c                 Z   > U R                  5       R                  TSTR                  5      $ rE   r-   rG  r:  r   r{  s    r>   unshape-LongT5LocalAttention.forward.<locals>.unshape  %    $$&++JDNNKKr@   r    rL   rh   ...qhd,...khd->...hqkr
   r6  Tr   r{       _r-   r7  ...hqk,...khd->...qhd)r4   r   r   r   rH   r%   rU   r7   r   r   r8   r   rC   r+   r  r9  rC  r3  rr   r   r   r;   rE  r  rF  r   rq   r   )r   r   r   rI  rK  rM  rN  r4   r  rP  rS  rT  rU  rY  rZ  present_key_value_stater[  r   s   `                @r>   r   LongT5LocalAttention.forward  sf    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'9L#((););<ell+BL_`!![j[!"34ff[)"&.$;#==@PP/Gr@   )r%   r   r   r  r   r   r   r   r   re  r   r   r  r   r  r   r   r   Fr^  NNNF)r   r   r   r   r!   rD  r   r  ra  r&  r   r3  r   r   r   r   s   @r>   rc  rc  )  sc    ,| ,$ ,[_ , ,4;  -  - ^ 6 I Ir@   rc  c                      ^  \ rS rSrSS\S\SS4U 4S jjjrS r\SS j5       r	S	\
4S
 jrS\R                  S\R                  S\R                  4S jr    SS jrSrU =r$ )LongT5TransientGlobalAttentioni  r   r   r(   Nc                   > [         TU ]  5         UR                  U l        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U l
        UR                  U l        U R                  S-   U l        UR                  U l        UR                  U l        U R                  U R                  -  U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                  U R                   SS9U l        ["        R$                  " U R                   U R                  SS9U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [3        5       U l        U R                  (       a0  ["        R.                  " U R                  U R                  5      U l        [9        UR                  UR:                  S9U l        g )Nr    Fr   r   )r   r   r   r   r   r   r   r   r   r   r   re  r%   rm   r   r   r   r   r   r   r   r   r   r  r  r  r  global_relative_attention_biasr   r   global_input_layer_normrf  s      r>   r   'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r@   c                 
   [        U5      S:X  a  g [        XR                  U R                  U R                  5      u  p[        U R                  U5      U l        [        U R                  U5      U l        [        U R                  U5      U l        [        U R                  USS9U l	        U R                  [        U5      -
  U l        U R                  U R                  -  U l
        U R                  R                  U5      U l        g r	  r
  r  s      r>   r  *LongT5TransientGlobalAttention.prune_heads	  r  r@   c                 b   SnU(       aC  US-  nX@S:  R                  [        R                  5      U-  -  n[        R                  " U 5      n O,[        R                  " U [        R
                  " U 5      5      * n US-  nX:  nU[        R                  " U R                  5       U-  5      [        R                  " X5-  5      -  X%-
  -  R                  [        R                  5      -   n[        R                  " U[        R                  " XrS-
  5      5      nU[        R                  " X`U5      -  nU$ r  r  r  s           r>   r&  8LongT5TransientGlobalAttention._relative_position_bucket  rl  r@   rm  c                    U R                   R                  R                  R                  S:w  a   U R                   R                  R                  OSn[        R
                  " SU-  [        R                  US9nX1U*  nUSSS24   USS2S4   -
  nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      R                  S5      R                  S5      nU$ ro  rq  rr  s           r>   r3  +LongT5TransientGlobalAttention.compute_biasJ  ru  r@   r   r   c                 x   [         R                  " US   US S 2S S S 24   5      S S 2S S4   n[         R                  " US:  SS5      n[        XR                  5      nU R                  UU R                  (       + U R                  U R                  S9nU R                  U5      nUR                  / SQ5      nXG-   nU$ )Nr   .r   r{   r  r*  )r   r
   r    rg   )r7   eqrr   r   rm   r&  r   r   r   r  r,  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r>   compute_side_bias0LongT5TransientGlobalAttention.compute_side_biasb  s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1=""r@   c                 P	  ^ ^ UR                   S S u  mnUU 4S jnUU 4S jn[        Ub  UO"[        R                  " UR                   S S 5      T R                  5      u  pU
R                   S   n[        XU5      nT R                  U5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      nU" T R                  U5      5      n[        UT R                  SS9n[        UT R                  SS9n[        UT R                  SS9n[        USSS9n[        USSS9nS/UR                  S-   -  nUR                   S   US'   UR                  S5      R                  U5      nUR                  S5      R                  U5      n[        R                   " UU/SS9n[        R                   " UU/SS9n[        R"                  " SX5      nUb=  [%        UT R                  UR&                  5      n[        R(                  " US	:  S
S5      nOS nUGct  T R*                  (       dz  [        R,                  " SST R.                  T R                  ST R                  -  4UR&                  UR0                  S9nT R2                  (       a  T R4                  (       a  SUl        OT R9                  T R                  5      nUb  UUR;                  SS5      -   nUR=                  UR0                  5      nUc  [        R                  " TU5      nT R?                  X*5      n[        UT R                  SS9R;                  SS5      nUR=                  UR0                  5      RA                  UR&                  5      n[        R                   " UU/SS9nUU-  n[B        RD                  RG                  URI                  5       SS9RK                  U5      n[B        RD                  RM                  UT RL                  T R4                  S9nUb  UU-  nUR=                  UR0                  5      nU" [        R"                  " SUU5      5      nUS S 2S U2S S 24   nT RO                  U5      nS nU4U4-   U4-   nU(       a  UU4-   nU$ )Nrg   c                 T   > U R                  TSTR                  TR                  5      $ rx  rz  r{  s    r>   r4   5LongT5TransientGlobalAttention.forward.<locals>.shape  r~  r@   c                 Z   > U R                  5       R                  TSTR                  5      $ r  r  r{  s    r>   r  7LongT5TransientGlobalAttention.forward.<locals>.unshape  r  r@   r-   r    rL   rh   r  r   r{   r  r
   r6  Tri   r7  r  )(r4   r   r7   r   rm   r   r  r   r   r   rH   r%   rU   r9   rY   r   rQ   r   rl   rC   rr   r   r8   r   r+   r  r9  rC  r3  r   rq   r  ra   r   r;   rE  r  rF  r   r   )r   r   r   rI  rK  rM  rN  r4   r  ro   r   _global_seq_lenglobal_inputsrP  rS  rT  side_key_statesside_value_statesrepsrU  r^   side_position_biasrY  rZ  r  r[  r   s   `                         @r>   r   &LongT5TransientGlobalAttention.forwardw  sM    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
O<!D
yy,0A!BJ 5|P#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9L#((););<ell+BLR^_`!![j[!"34ff[)"&.$;#==@PP/Gr@   )r%   r   r   rm   r  r  r   r   r   r   r   re  r   r   r  r   r  r   r   r   r  r^  r  )r   r   r   r   r!   rD  r   r  ra  r&  r   r3  r7   r}   r  r   r   r   r   s   @r>   r  r    s    f| f$ f[_ f f>;  -  - ^ 0#ell # #Y^YeYe #0 v vr@   r  c                   R   ^  \ rS rSrSS\\   4U 4S jjjr       SS jrSrU =r	$ )LongT5LayerSelfAttentioni  r   c                    > [         TU ]  5         [        XUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r  s       r>   r   !LongT5LayerSelfAttention.__init__  sQ    ,W`
 *&..f>W>WXzz&"5"56r@   c	                     U R                  U5      n	U R                  U	UUUUUUUS9n
XR                  U
S   5      -   nU4U
SS  -   nU$ )N)r   rI  rK  rJ  rL  rM  r/  r   r    )r   r  r   )r   r   re   rI  rK  rJ  rL  rM  r/  normed_hidden_statesattention_outputr[  s               r>   r    LongT5LayerSelfAttention.forward  st      $}=-- '+)/) . 	
 &5Ea5H(II "%5ab%99r@   )r  r   r   r]  )NNNNFFN
r   r   r   r   r   r   r   r   r   r   r   s   @r>   r  r    s:    7XVY] 7 7  r@   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
LongT5LayerLocalSelfAttentioni  z$Local self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g N)r   r   )r   r   rc  LocalSelfAttentionr   r   r   r   r   r   r   r   r  s       r>   r   &LongT5LayerLocalSelfAttention.__init__  sI    "6v"w)&..f>W>WXzz&"5"56r@   kwargsc                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ N)r   rI  rK  rM  r   r    )r   r  r   
r   r   re   rI  rK  rM  r  r  r  r[  s
             r>   r   %LongT5LayerLocalSelfAttention.forward  sk      $}=22 '+/ 3 
 &5Ea5H(II "%5ab%99r@   )r  r   r   r]  r  r   r   r   r   __doc__r   r   r   r   r   r   r   r   s   @r>   r  r    s>    .7XVY] 7 7   r@   r  c                   X   ^  \ rS rSrSrSS\\   4U 4S jjjr    S	S\4S jjr	Sr
U =r$ )
'LongT5LayerTransientGlobalSelfAttentioni4  z/Transient-Global self attention used in encoderr   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r  )r   r   r  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r  s       r>   r   0LongT5LayerTransientGlobalSelfAttention.__init__7  sN    ,J-
) *&..f>W>WXzz&"5"56r@   r  c                     U R                  U5      nU R                  UUUUUS9nXR                  US   5      -   nU4USS  -   n	U	$ r  )r   r  r   r  s
             r>   r   /LongT5LayerTransientGlobalSelfAttention.forward?  sk      $}=<< '+/ = 
 &5Ea5H(II "%5ab%99r@   )r  r   r   r]  r  r  r   s   @r>   r  r  4  s>    97XVY] 7 7   r@   r  c                   T   ^  \ rS rSrSS\\   4U 4S jjjr        SS jrSrU =r	$ )LongT5LayerCrossAttentioniV  r   c                    > [         TU ]  5         [        USUS9U l        [	        UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r>   r   "LongT5LayerCrossAttention.__init__W  sO    .vSXdmn)&..f>W>WXzz&"5"56r@   c                     U R                  U5      nU R                  UUUUUUUUU	U
S9
nXR                  US   5      -   nU4USS  -   nU$ )N)	r   rH  rI  rK  rJ  rL  r-  rM  r/  r   r    )r   r  r   )r   r   rH  re   rI  rK  rJ  rL  r-  rM  r/  r  r  layer_outputr[  s                  r>   r   !LongT5LayerCrossAttention.forward]  sy      $}=// -'+)%/) 0 
 %||4DQ4G'HH/$4QR$88r@   )r  r   r   r   )NNNNFNFNr  r   s   @r>   r  r  V  s<    7(3- 7 7  r@   r  c                   \   ^  \ rS rSrSS\\   4U 4S jjjr            SS jrSrU =r	$ )LongT5Blocki|  r   c                 $  > [         TU ]  5         UR                  U l        UR                  (       a  [        nOGUR                  S:X  a  [
        nO0UR                  S:X  a  [        nO[        SUR                   S35      e[        R                  " 5       U l
        U R                  R                  U" XUS95        U R                  (       a"  U R                  R                  [        XS95        U R                  R                  [        U5      5        g )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrP   r  r   )r   r   r   r   attention_layerr   s        r>   r   LongT5Block.__init__}  s     ++6O**g5;O**.@@EO!889<  ]]_


Fgpq	
 ??JJ7TU

-/0r@   c                    U R                   S   " UUUUU	U
UUS9nUS S u  pUSS  nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU R                  =(       a    US LnU(       a  U R                   S   " UUUUUU	US   S-   U
UUS9
nUS S u  pUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nUUSS  -   nU R                   S   " U5      nUR                  [        R                  :X  am  [        R                  " U5      R                  5       (       aD  [        R                  " UR                  5      R                  S-
  n[        R                  " UU* US9nU4nU
(       a  UU	4-   U-   nU$ UU-   nU$ )	Nr   )re   rI  rK  rJ  rL  rM  r/  rg   i  )r  r   r    r-   )	rH  re   rI  rK  rJ  r-  rL  rM  r/  )
r  r+   r7   r   isinfanyfinfor   clampr   )r   r   re   rI  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasrK  cross_attn_layer_head_maskrJ  rL  rM  return_dictr/  self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr[  s                       r>   r   LongT5Block.forward  s-     "&A)'+)/)	"
 )?r(B%2126 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM!__R1Fd1R&*jjm!65; :-+B/!3#"3-'# -DBQ,G)M ""emm3M8R8V8V8X8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O 

2}5 %--/EKK4N4R4R4T4T++m&9&9:>>EK!KKK<[YM " 114EEG   11Gr@   )r   r  r]  )NNNNNNNNFFTNr  r   s   @r>   r  r  |  sK    1XVY] 1 14 "#&*#'I Ir@   r  c                   J    \ rS rSr\rSrSrS/rSr	Sr
\S 5       rS rS rS	rg
)LongT5PreTrainedModeli  transformerTr  Fc                 z    [         R                  " [        5      n[         R                  " [        5      nUUUS.nU$ )N)decoder_input_ids	input_idsdecoder_attention_mask)r7   r   r   r   )r   r  
input_maskdummy_inputss       r>   r  "LongT5PreTrainedModel.dummy_inputs  s8     LL.	\\*-
!*"&0

 r@   c                    U R                   R                  n[        U[        5      (       a)  UR                  R
                  R                  US-  5        g[        U[        [        [        45      (       a  UR                  R                  R
                  R                  SUS-  S9  [        US5      (       aN  U R                   R                  (       d2  UR                  R                  R
                  R                  SUS-  S9  ggg[        U[        5      (       GaQ  UR                   R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR                   S5      (       aE  UR                   R$                  b.  UR                   R$                  R
                  R'                  5         UR(                  R                  R
                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aG  UR(                  R$                  b/  UR(                  R$                  R
                  R'                  5         ggg[        U[,        5      (       Ga  UR.                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR.                  S5      (       aE  UR.                  R$                  b.  UR.                  R$                  R
                  R'                  5         UR0                  R                  R
                  R                  SX R                   R"                  S-  -  S9  [        UR0                  S5      (       aE  UR0                  R$                  b.  UR0                  R$                  R
                  R'                  5         UR(                  R                  R
                  R                  SX R                   R*                  S-  -  S9  [        UR(                  S5      (       aG  UR(                  R$                  b/  UR(                  R$                  R
                  R'                  5         ggg[        U[2        [4        [6        45      (       Ga  U R                   R"                  nU R                   R8                  nU R                   R:                  nUR<                  R                  R
                  R                  SX#U-  S-  -  S9  UR>                  R                  R
                  R                  SX#S-  -  S9  UR@                  R                  R
                  R                  SX#S-  -  S9  URB                  R                  R
                  R                  SX%U-  S-  -  S9  URD                  (       a}  URF                  R                  R
                  R                  SX#S-  -  S9  [        U[6        5      (       a4  URH                  R                  R
                  R                  SX#S-  -  S9  gggg)zInitialize the weightsr|   r{   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharednormal_hasattrtie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   rc  r  r   r   r   r   r   r   r   r  r  )r   modulefactorr   r   r   s         r>   _init_weights#LongT5PreTrainedModel._init_weights  s   //fo..MM$$Vc\2.LN` abb MM  %%--3FSL-Ivy))$++2Q2Q%%**22#2N 3R) 344 II!!))s;;CVCV[_B_8`)avyy&))fiinn.H		##))+II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I) 899KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-KK##++&[[EXEX]aDa:b+cv{{F++0@0@0L  %%++-II!!))s;;CSCSX\B\8])^vyy&))fiinn.H		##))+ /I)2FHf ghh kk))G!%!1!1kk++GHHOO  ((cvL^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cvL^B^cgAg7h(i11..55::BBQWhl[lQmBnf&DEE99@@EEMM fT0A&B N  F 2 ir@   c                    U R                   R                  nU R                   R                  nUc  [        S5      e[	        U5      (       aE  [
        R                  " UR                  S S S-   U5      n[
        R                  " XASS S24   /SS9nO=UR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc  [        S5      eUR                  US	:H  U5        U$ )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r-   )r    .rL   r    ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  r   r7   fullr4   rQ   	new_zerosclonemasked_fill_)r   r  r  r  shifted_input_idss        r>   _shift_right"LongT5PreTrainedModel._shift_right&  s    !%!C!C{{//!)8  Y'' %

9??3B+?$+FH^ _ %		+<SbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r@   r.   N)r   r   r   r   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_cache_class_supports_static_cachepropertyr  r	  r  r   r.   r@   r>   r  r    sD    L%&*#& " .b!r@   r  c                   .  ^  \ rS rSrSU 4S jjrS rS r             SS jr SS\\	R                  S4   S\	R                  S	\	R                  S
\S\4
S jjr\S\	R                  S\S\S\	R                   S	\	R                  S\4S j5       rSrU =r$ )LongT5StackiB  c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        Ub  UR                  U R                  l        UR                  U l        UR                  U l	        U R                  S-   U l
        [        R                  " [        UR                  5       Vs/ s H  n[        U[        US:H  5      US9PM     sn5      U l        [#        UR
                  UR$                  S9U l        [        R(                  " UR*                  5      U l        SU l        U R1                  5         g s  snf )Nr    r   r  r   F)r   r   r   r  
vocab_sizer   embed_tokensr   r   re  r%   r  rM   
num_layersr  rD  blockr   r   final_layer_normr   r   r   r  	post_init)r   r   r!  rS   r   s       r>   r   LongT5Stack.__init__C  s    LL):):FNNK#'3':':D$ ++"//**Q.]] v0011A FQ!VXYZ1

 !0FD]D] ^zz&"5"56&+# 	s   <!E	c                     U R                   $ r   r!  r   s    r>   get_input_embeddings LongT5Stack.get_input_embeddings]  s       r@   c                     Xl         g r   r(  r   new_embeddingss     r>   set_input_embeddings LongT5Stack.set_input_embeddingsa  s    *r@   c                 \   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb*  Ub'  U R
                  (       a  SOSn[        SU SU S35      eUb&  UR                  5       nUR                  SUS   5      nO>Ub  UR                  5       S S nO'U R
                  (       a  SOSn[        SU SU S	35      eU R                  (       a/  U R                  (       a  U	(       a  [        R                  S
5        Sn	Uc%  U R                  c   S5       eU R                  U5      nUu  nnSnSnU R
                  (       a  U	(       d  Ub  [        U[        5      (       a,  [        U[         5      (       d  Sn[!        U[#        5       5      nOv[        U[         5      (       d.  Sn[        R                  S5        [         R$                  " U5      nO3Uc  [!        [#        5       [#        5       5      nOU R
                  (       d  S nUb  UR'                  5       OSnUc#  [(        R*                  " UUU-   UR,                  S9nUc4  [/        5       (       d%  UU-   n[(        R0                  " UUUR,                  S9nU R
                  (       a%  U R3                  UUUUb  UR4                  OS U
5      nO=U R                   R6                  S:X  a!  [9        X R:                  UR,                  5      nOUnU R
                  (       aO  UbL  UR                  5       u  nnnUU4nUc  [(        R0                  " UUR,                  S9nU R=                  U5      nOS nU R?                  X`R                   R@                  5      nU R?                  XpR                   R@                  5      nU(       a  SOS nU
(       a  SOS nU
(       a  U R
                  (       a  SOS nS nS n U RC                  U5      n![E        U RF                  5       H  u  n"n#UU"   n$UU"   n%U(       a  UU!4-   nU R                  (       a:  U R                  (       a)  U RI                  U#RJ                  U!UUUUU U$U%S U	U
UU5      n&OU#" U!UUUUU U$U%UU	U
UUS9n&U	SL a  U&S S S-   U&SS  -   n&U&S S u  n!n'U&S   nU R
                  (       a  Ub  U&U
(       a  SOS   n U
(       d  M  UU&S   4-   nU R
                  (       d  M  UU&S   4-   nM     U RM                  U!5      n!U RC                  U!5      n!U(       a  UU!4-   nU	(       a  W'OS n(U(       a  UR4                  n(U(       a  URO                  5       n(U(       d  [Q        S U!U(UUU4 5       5      $ [S        U!U(UUUS9$ )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer-   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   ry   r  r.   )re   rI  r  r  r  rK  r  rJ  rL  rM  r  r/  r    r   rg      r
      c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   r.   ).0r   s     r>   	<genexpr>&LongT5Stack.forward.<locals>.<genexpr>%  s"      
A  s   	)last_hidden_statepast_key_valuesr   
attentionscross_attentions)*r   rL  rM  output_hidden_statesuse_return_dictr   r  sizer:  r  r9  r   r   r!  r   r   r   r   from_legacy_cacheget_seq_lengthr7   rW   rC   r   r   _update_causal_maskr>  r  rl   r%   invert_attention_maskget_head_maskr"  r   	enumerater#  _gradient_checkpointing_funcr   r$  to_legacy_cacherO   r   ))r   r  re   r  r  r4  	head_maskcross_attn_head_maskr<  rL  rM  r?  r  r/  err_msg_prefixinput_shaper   rN  return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthrW  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsrI  r  r   rS   layer_modulerK  r  layer_outputsnext_decoder_cache
next_caches)                                            r>   r   LongT5Stack.forwardd  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	 $$0p2pp0 --i8M!,
J $&+#??	_-H/511*_Vi:j:j.2+"5o|~"V1DEE&*###`
 #6"G"G"X ("5lnln"U #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D*D4zAO"ZZ
OML`L`aN??228G8S44Y]!K [[//7:3NNNTaThThiK(K ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y++2H2HI	#112FH^H^_"6BD0d&7DOOrRV(,%]3(4OA|'lO)=a)@&#$58H$H!**t}} $ A A ((!!)31#.%"!" !-!#."/*?+J2O$3/I#2'&7 +#1!$ E! -bq 1G ;mAB>O O0=bq0A-M-
 *!,M#8#D0=CTaZ[0\-  !/=3C2E!E???+?=QRCSBU+U(u  5x --m<]3   1]4D D+4'$
&(==J(88:J 
 "%"(
 
 
 9+&+%1
 	
r@   re   r"   input_tensorr/  r<  rM  c           	         U R                   R                  S:X  a  Ub  US:H  R                  5       (       a  U$ g U R                   R                  S:X  a,  [        U[        R
                  5      (       a  [        U5      nU$ Ub  UR                  5       OSnUb  UR                  OSnU R                   R                  S:X  a5  U(       d.  U(       d'  [        R                  " UUUU R                  S9(       a  g UR                  nUR                  S   n	U(       a  UR                  5       n
O5[        U[        R
                  5      (       a  UR                  S	   OXi-   S-   n
U R                  UU	U
UUUR                  S   S
9nU R                   R                  S:X  aZ  UbW  UR                   R"                  S;   a=  U(       d6  [        R$                  " U5      R&                  n[        R(                  " X5      nU$ )Nflash_attention_2r{   flex_attentionr   Fsdpa)r4  rP  is_trainingr    r-   )sequence_lengthtarget_lengthr+   r/  r   )cudaxpunpu)r   _attn_implementationr  r   r7   r}   r#   rC  is_compileabler   _ignore_causal_mask_sdpar9  r+   r4   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrC   rq   r  r  _unmask_unattended)r   re   r_  r/  r<  rM  past_seen_tokensusing_compilable_cacher+   re  rf  rW  	min_dtypes                r>   rD  LongT5Stack._update_causal_mask9  s    ;;++/BB)~/D.I.I.K.K%%;;++/??.%,,77!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell;; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCK[Kr@   re  rf  r+   r   c                    U b  U R                  5       S:X  a  U nU$ [        R                  " U5      R                  n[        R                  " X4XUR
                  S9nUS:w  a  [        R                  " USS9nU[        R                  " X$R
                  S9UR                  SS5      :  -  nUSSSS2SS24   R                  USSS5      nU b  UR                  5       nU R                  S   n	USS2SS2SS2SU	24   U SS2SSSS24   R                  UR
                  5      -   n
U
S:H  n
USS2SS2SS2SU	24   R                  X5      USS2SS2SS2SU	24'   U$ )	a  
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

Args:
    attention_mask (`torch.Tensor`):
        A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
        `(batch_size, 1, query_length, key_value_length)`.
    sequence_length (`int`):
        The sequence length being processed.
    target_length (`int`):
        The target length: when generating with static cache, the mask should be as long as the static cache,
        to account for the 0 padding, the part of the cache that is not filled yet.
    dtype (`torch.dtype`):
        The dtype to use for the 4D attention mask.
    cache_position (`torch.Tensor`):
        Indices depicting the position of the input sequence tokens in the sequence.
    batch_size (`torch.Tensor`):
        Batch size.
Nr5  )
fill_valuer+   rC   r    )diagonalry   r-   r   )r&   r7   r  r  r  rC   triurW   rE   expandr  r4   ra   masked_fill)re   re  rf  r+   r/  r   r  rW  rr  mask_lengthpadding_masks              r>   rn  ALongT5Stack._prepare_4d_causal_attention_mask_with_cache_position}  s}   > %.*<*<*>!*C(K* ' E*..I** 0Y\j\q\qK !##jjqA5<<>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c 6Aq!\k\12 r@   )r#  r%   r   r!  r$  r  r   re  r   )NNNNNNNNNNNNNr  )r   r   r   r   r   r*  r/  r   r   r7   r}   r   rD  rD  ra  r   r+   rn  r   r   r   s   @r>   r  r  B  s    4!+
 "#!!R
v #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r@   r  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c            &       j  ^  \ rS rSrS/rSS/rS\4U 4S jjrS rS r	S	 r
S
 rS rS r\                S S\\R"                     S\\R$                     S\\R"                     S\\R&                     S\\R$                     S\\R$                     S\\R(                     S\\\\R$                           S\\\\R$                           S\\R(                     S\\R(                     S\\   S\\   S\\   S\\   S\\R"                     S\\\R$                     \4   4"S jj5       rSrU =r$ )!r  i  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl	        SUl        UR                  Ul        [        X0R                  5      U l        U R#                  5         g )NFT)r   r   r   r  r   r   r  copydeepcopyr   rL  is_encoder_decoderr  encodernum_decoder_layersr"  decoderr%  r   r   encoder_configdecoder_configr   s       r>   r   LongT5Model.__init__  s     ll6#4#4fnnEv.$)!#( ,1)">;;?v.$(!,1)$*$=$=!">;;? 	r@   c                     U R                   $ r   r  r)  s    r>   r*   LongT5Model.get_input_embeddings      {{r@   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   r  r  r/  r  r-  s     r>   r/   LongT5Model.set_input_embeddings  +    $)).9)).9r@   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g r   r   r  _tie_or_clone_weightsr  r!  r  r  r)  s    r>   _tie_weightsLongT5Model._tie_weights  P    ;;**&&t||'@'@$++N&&t||'@'@$++N +r@   c                     U R                   $ r   r  r)  s    r>   get_encoderLongT5Model.get_encoder      ||r@   c                     U R                   $ r   r  r)  s    r>   get_decoderLongT5Model.get_decoder  r  r@   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gz
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
Nitemsr  r  	attentionr  r   heads_to_pruner  r  s       r>   _prune_headsLongT5Model._prune_heads  <    
 +002LELLu%//;;EB 3r@   r  re   r  r  rJ  decoder_head_maskrK  encoder_outputsr<  r4  decoder_inputs_embedsrL  rM  r?  r  r/  r(   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUbR  UcO  U R                   R                  U R                   R                  :X  a!  [
        R                  " [        [        5        UnUc  U R                  UUU
UUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nU R                  UUUU	UUUUUUUUUS9nU(       d  UU-   $ [        UR                  UR                   UR"                  UR$                  UR&                  UR                  UR"                  UR$                  S9$ )	a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, LongT5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

>>> # Let's try a very long encoder input.
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1

>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```Nr  re   r4  rJ  rM  r?  r  r   r    rg   r;  r   r=  r  re   r4  r<  r  r  rJ  rK  rL  rM  r?  r  r/  )r;  r<  decoder_hidden_statesdecoder_attentionsr>  encoder_last_hidden_stater  encoder_attentions)r   rL  r@  r"  r  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr  r   r   r  r  r   r;  r<  r   r=  r>  )r   r  re   r  r  rJ  r  rK  r  r<  r4  r  rL  rM  r?  r  r/  r   decoder_outputss                      r>   r   LongT5Model.forward  s   b "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1'!5/!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r@   )r  r  r  )NNNNNNNNNNNNNNNN)r   r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr!   r   r*  r/  r  r  r  r  r   r   r7   
LongTensorFloatTensor
BoolTensorr}   r   rD  r   r   r   r   r   r   s   @r>   r  r    s    	R*& 89VW| &:
O
C  156:8<=A159=7;EIEI048<$(,0/3&*59#J
E,,-J
 !!2!23J
 $E$4$45	J

 !))9)9 :J
 E--.J
 $E$5$56J
 'u||4J
 "%e.?.?(@"ABJ
 "%e.?.?(@"ABJ
  -J
  (5J
 D>J
 $D>J
 'tnJ
  d^!J
" !!1!12#J
$ 
uU&&');;	<%J
 J
r@   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc            (         ^  \ rS rSrS/r/ SQrS\4U 4S jjrS rS r	S r
S	 rS
 rS rS r\                 S#S\\R$                     S\\R&                     S\\R$                     S\\R(                     S\\R&                     S\\R&                     S\\R*                     S\\\\R*                           S\\\\R*                           S\\R&                     S\\R&                     S\\R$                     S\\   S\\   S\\   S\\   S\\R$                     S\\\R&                     \4   4$S jj5       rS\R*                  4S  jrS! rS"rU =r$ )$r   i  r~  )r  r  zlm_head.weightr   c                 L  > [         TU ]  U5        UR                  U l        [        R
                  " UR                  UR                  5      U l        [        R                  " U5      nSUl
        SUl        SUl        [        X R                  5      U l        [        R                  " U5      nSUl
        SUl        UR                  Ul        [        X0R                  5      U l        [        R$                  " UR                  UR                  SS9U l        U R)                  5         g )NFTr   )r   r   r   	model_dimr   r  r   r  r  r  r   rL  r  r  r  r  r"  r  r   r  r%  r  s       r>   r   'LongT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#( ,1)">;;?v.$(!,1)$*$=$=!">;;?yy1B1BO 	r@   c                     U R                   $ r   r  r)  s    r>   r*  3LongT5ForConditionalGeneration.get_input_embeddings  r  r@   c                 |    Xl         U R                  R                  U5        U R                  R                  U5        g r   r  r-  s     r>   r/  3LongT5ForConditionalGeneration.set_input_embeddings  r  r@   c                     U R                   R                  (       aa  U R                  U R                  R                  U R
                  5        U R                  U R                  R                  U R
                  5        g g r   r  r)  s    r>   r  +LongT5ForConditionalGeneration._tie_weights  r  r@   c                     Xl         g r   r  r-  s     r>   set_output_embeddings4LongT5ForConditionalGeneration.set_output_embeddings  s    %r@   c                     U R                   $ r   r  r)  s    r>   get_output_embeddings4LongT5ForConditionalGeneration.get_output_embeddings  r  r@   c                     U R                   $ r   r  r)  s    r>   r  *LongT5ForConditionalGeneration.get_encoder  r  r@   c                     U R                   $ r   r  r)  s    r>   r  *LongT5ForConditionalGeneration.get_decoder  r  r@   r  re   r  r  rJ  r  rK  r  r<  r4  r  labelsrL  rM  r?  r  r/  r(   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUbR  UcO  U R                   R                  U R                   R                  :X  a!  [
        R                  " [        [        5        UnUc  U R                  UUU
UUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUS   nUb  Uc  Uc  U R                  U5      nU R                  UUUU	UUUUUUUUUS9nUS   nU R                   R                  (       a  UU R                   S-  -  nU R#                  U5      nSnUb[  [%        S	S
9nUR'                  UR(                  5      nU" UR+                  SUR-                  S5      5      UR+                  S5      5      nU(       d  U4USS -   U-   nUb  U4U-   $ U$ [/        UUUR0                  UR2                  UR4                  UR6                  UR8                  UR2                  UR4                  S9	$ )ar  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
    Training](./longt5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Examples:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
>>> model = LongT5ForConditionalGeneration.from_pretrained(
...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
... )

>>> # Let's try a very long input.
>>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
>>> input_ids = inputs.input_ids

>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
abstractthe aim of this article is to provide an overview of the literature on the role of dog
```Nr  r   r    rg   r  r  r  r  )ignore_indexr-   )	losslogitsr<  r  r  r>  r  r  r  )r   rL  r@  r"  r  r  r  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  r  r   r   r  r  r  r  r  r  r	   ra   rC   r:  rA  r   r<  r   r=  r>  r;  )r   r  re   r  r  rJ  r  rK  r  r<  r4  r  r  rL  rM  r?  r  r/  r   r  sequence_output	lm_logitsr  loss_fctoutputs                            r>   r   &LongT5ForConditionalGeneration.forward  so   j "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1'!5/!5#) ' 
  *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD \OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r@   c                 $    U R                  U5      $ r   )r  )r   r  s     r>   %prepare_decoder_input_ids_from_labelsDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsf  s      ((r@   c           	      F   Uc  [         R                  S5        U$ SnU H~  nSnU H2  nUUR                  SUR                  UR                  5      5      4-   nM4     US   R
                  US   R
                  :X  d   e[        U5      [        U5      :X  d   eX54-   nM     U$ )NzHYou might want to consider setting `use_cache=True` to speed up decodingr.   r   )r   warningindex_selectra   rC   r4   r  )r   r<  beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r>   _reorder_cache-LongT5ForConditionalGeneration._reorder_cachei  s     "NNef""!#!0 +-'$5 .I$11!X[[AQAXAX5YZM /+ %6 /q177;LQ;O;U;UUUU23s;L7MMMM%;>\%\" "1 &%r@   )r  r  r  r  r  )NNNNNNNNNNNNNNNNN)r   r   r   r   r  r  r!   r   r*  r/  r  r  r  r  r  r   r   r7   r  r  r  r}   r   rD  r   r   r   r  r  r   r   r   s   @r>   r   r     s6    	R*& j| .:
O
&  156:8<=A159=7;@D@D59=A-1$(,0/3&*59%f
E,,-f
 !!2!23f
 $E$4$45	f

 !))9)9 :f
 E--.f
 $E$5$56f
 'u||4f
 "%ell(;"<=f
 "%ell(;"<=f
   1 12f
  ((9(9:f
 ))*f
 D>f
 $D>f
  'tn!f
" d^#f
$ !!1!12%f
& 
uU&&'8	9'f
 f
P)ELL )& &r@   r   c                   >  ^  \ rS rSrS/rS/rS\4U 4S jjrS rS r	S r
S	 rS
 r\       SS\\R                      S\\R"                     S\\R"                     S\\R"                     S\\   S\\   S\\   S\\\R"                     \4   4S jj5       rSrU =r$ )r  i  r  r  r   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " U5      nSUl	        SUl
        [        X R                  5      U l        U R                  5         g )NF)r   r   r   r  r   r   r  r  r  rL  r  r  r  r%  )r   r   r  r   s      r>   r   LongT5EncoderModel.__init__  sf     ll6#4#4fnnEv.#( ,1)">;;? 	r@   c                     U R                   $ r   r  r)  s    r>   r*  'LongT5EncoderModel.get_input_embeddings  r  r@   c                 F    Xl         U R                  R                  U5        g r   )r  r  r/  r-  s     r>   r/  'LongT5EncoderModel.set_input_embeddings  s    $)).9r@   c                     U R                   R                  (       a1  U R                  U R                  R                  U R
                  5        g g r   )r   r  r  r  r!  r  r)  s    r>   r  LongT5EncoderModel._tie_weights  s2    ;;**&&t||'@'@$++N +r@   c                     U R                   $ r   r  r)  s    r>   r  LongT5EncoderModel.get_encoder  r  r@   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     gr  r  r  s       r>   r  LongT5EncoderModel._prune_heads  r  r@   r  re   rJ  r4  rM  r?  r  r(   c           
      f    Ub  UOU R                   R                  nU R                  UUUUUUUS9nU$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
    you should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for detail.

    To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
    Training](./longt5#training).

Example:

```python
>>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
>>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
>>> input_ids = tokenizer(
...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```r  )r   r@  r  )	r   r  re   rJ  r4  rM  r?  r  r  s	            r>   r   LongT5EncoderModel.forward  sK    F &1%<k$++B]B],,)'/!5# ' 
 r@   )r  r  )NNNNNNN)r   r   r   r   r  r  r!   r   r*  r/  r  r  r  r   r   r7   r  r  rD  r   r   r   r   r   r   r   s   @r>   r  r    s   78*4&
| 
:OC  156:1559,0/3&*.E,,-. !!2!23. E--.	.
   1 12. $D>. 'tn. d^. 
uU&&'8	9. .r@   r  )r  r   r  r  )r   )[r  r  r  r  typingr   r   r   r   r   r7   r   torch.nnr	   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   r   configuration_longt5r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerr   r   r}   r   r?   rH   rU   r]   rd   rC   rl   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionr  rP   r   r   r   r   rc  r  r  r  r  r  r  r  r  __HEAD_MASK_WARNING_MSGr  r   r  __all__r.   r@   r>   <module>r     s       4 4   % ! C C ) >  . g g   /  !!;J 
		H	%  3 3 W\WcWc  #%,, #3 #S #U\\ #4U\\ 4c 4 4Y\ 4ejeqeq 42!# !%,, !BU\\ Bc BV[VbVb B8ell 8s 8TYT`T` 8ejeqeq 8 .PLL.P58.P
5<<%&.Pb4U\\ 4VY 4^c^j^j 4	j<<	j,1LL	jJM	j
\\	j+bii +2	/"O
KKef   O ,")) ,ryy &BII &abii aH}299 }@DRYY DP!ryy !HBII >bii D#		 #La")) aH ^!O ^! ^!Bq' qj  
' 
 
D 
x&%:O x&
x&v U. U Up k{>  	 	
NN[\	s   =M& &N	/N	N	