
"""PyTorch T5 model."""

import copy
import math
import os
import warnings
from typing import List, Optional, Tuple, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqQuestionAnsweringModelOutput,
    Seq2SeqSequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import ALL_LAYERNORM_LAYERS, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    DUMMY_INPUTS,
    DUMMY_MASK,
    add_start_docstrings,
    auto_docstring,
    is_torch_flex_attn_available,
    is_torch_fx_proxy,
    is_torchdynamo_compiling,
    logging,
)
from ...utils.model_parallel_utils import assert_device_map, get_device_map
from .configuration_t5 import T5Config


if is_torch_flex_attn_available():
    from torch.nn.attention.flex_attention import BlockMask

    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
    """Load tf checkpoints in a pytorch model."""
    try:
        import re

        import numpy as np
        import tensorflow as tf
    except ImportError:
        logger.error(
            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions."
        )
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
    # Load weights from the TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    tf_weights = {}
    for name, shape in init_vars:
        logger.info(f"Loading TF weight {name} with shape {shape}")
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        tf_weights[name] = array

    for txt_name in names:
        name = txt_name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to compute m and v,
        # which are not required for using the pretrained model
        if any(
            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
            for n in name
        ):
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        if "_slot_" in name[-1]:
            logger.info(f"Skipping {'/'.join(name)}")
            tf_weights.pop(txt_name, None)
            continue
        pointer = model
        array = tf_weights[txt_name]

        for m_name in name:
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            if scope_names[0] in ["kernel", "scale", "embedding"]:
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "self_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[0]
            elif scope_names[0] == "enc_dec_attention":
                pointer = getattr(pointer, "layer")
                pointer = pointer[1]
            elif scope_names[0] == "dense_relu_dense":
                pointer = getattr(pointer, "layer")
                pointer = pointer[2]
            elif scope_names[0] == "rms_norm":
                if hasattr(pointer, "layer_norm"):
                    pointer = getattr(pointer, "layer_norm")
                elif hasattr(pointer, "final_layer_norm"):
                    pointer = getattr(pointer, "final_layer_norm")
            elif scope_names[0] == "scale":
                pointer = getattr(pointer, "weight")
            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
                pointer = getattr(pointer, "bias")
            elif scope_names[0] == "squad":
                pointer = getattr(pointer, "classifier")
            elif scope_names[0] == "decoder" and name[1] == "logits":
                continue
            elif scope_names[0] == "logits":
                pointer = getattr(pointer, "lm_head")
            elif scope_names[0] == "wi" and len(scope_names) > 1 and scope_names[1].isdigit():
                pointer = getattr(pointer, f"wi_{scope_names[1]}")
                continue
            else:
                try:
                    pointer = getattr(pointer, scope_names[0])
                except AttributeError:
                    logger.info(f"Skipping {'/'.join(name)}")
                    continue
            if len(scope_names) >= 2:
                num = int(scope_names[1])
                pointer = pointer[num]
        if scope_names[0] not in ["kernel", "scale", "embedding"]:
            pointer = getattr(pointer, "weight")
        if scope_names[0] != "embedding":
            logger.info(f"Transposing numpy weight of shape {array.shape} for {name}")
            array = np.transpose(array)
        try:
            if pointer.shape != array.shape:
                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        logger.info(f"Initialize PyTorch weight {name}")
        pointer.data = torch.from_numpy(array.astype(np.float32))
        tf_weights.pop(txt_name, None)

    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
    return model
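# Illustrative usage of the converter above (a sketch, not executed on import; the
# checkpoint path below is a hypothetical placeholder for a directory produced by the
# original TensorFlow T5 training code):
#
#     from transformers import T5Config, T5ForConditionalGeneration
#
#     config = T5Config.from_pretrained("google-t5/t5-small")
#     model = T5ForConditionalGeneration(config)
#     model = load_tf_weights_in_t5(model, config, "/path/to/tf_checkpoint")  # hypothetical path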
T,T		TaR  
    This is an experimental feature and is subject to change at a moment's notice.

    Uses a device map to distribute attention modules of the model across several devices. If no device map is given,
    it will evenly distribute blocks across all devices.

    Args:
        device_map (`Dict[int, list]`, *optional*):
            A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always
            automatically mapped to the first device (for esoteric reasons). That means that the first device should
            have fewer attention modules mapped to it than other devices. For reference, the t5 models have the
            following number of attention modules:

                - google-t5/t5-small: 6
                - google-t5/t5-base: 12
                - google-t5/t5-large: 24
                - google-t5/t5-3b: 24
                - google-t5/t5-11b: 24

    Example:

    ```python
    # Here is an example of a device map on a machine with 4 GPUs using google-t5/t5-3b, which has a total of 24 attention modules:
    model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)
    ```
"""

DEPARALLELIZE_DOCSTRING = r"""
    Moves the model to cpu from a model parallel state.

    Example:

    ```python
    # On a 4 GPU machine with google-t5/t5-3b:
    model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-3b")
    device_map = {
        0: [0, 1, 2],
        1: [3, 4, 5, 6, 7, 8, 9],
        2: [10, 11, 12, 13, 14, 15, 16],
        3: [17, 18, 19, 20, 21, 22, 23],
    }
    model.parallelize(device_map)  # Splits the model across several devices
    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
    ```
"""


class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # T5 uses a layer norm which only scales and doesn't shift (RMSNorm); the variance is
        # computed without mean subtraction and there is no bias. Accumulation for
        # half-precision inputs is done in fp32.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into half-precision if necessary
        if self.weight.dtype in [torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)

        return self.weight * hidden_states


try:
    from apex.normalization import FusedRMSNorm

    T5LayerNorm = FusedRMSNorm  # noqa

    logger.info("Discovered apex.normalization.FusedRMSNorm - will use it instead of T5LayerNorm")
except ImportError:
    # using the normal T5LayerNorm
    pass
except Exception:
    logger.warning("discovered apex but it failed to load, falling back to T5LayerNorm")
    pass

ALL_LAYERNORM_LAYERS.append(T5LayerNorm)


class T5DenseActDense(nn.Module):
    def __init__(self, config: T5Config):
        super().__init__()
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.dropout(hidden_states)
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)
        hidden_states = self.wo(hidden_states)
        return hidden_states


class T5DenseGatedActDense(nn.Module):
    def __init__(self, config: T5Config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.act = ACT2FN[config.dense_act_fn]

    def forward(self, hidden_states):
        hidden_gelu = self.act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)

        # To make 8bit quantization work, self.wo is kept in float32; make sure the hidden
        # states are cast to its dtype unless the weights are in `int8`.
        if (
            isinstance(self.wo.weight, torch.Tensor)
            and hidden_states.dtype != self.wo.weight.dtype
            and self.wo.weight.dtype != torch.int8
        ):
            hidden_states = hidden_states.to(self.wo.weight.dtype)

        hidden_states = self.wo(hidden_states)
        return hidden_states


class T5LayerFF(nn.Module):
    def __init__(self, config: T5Config):
        super().__init__()
        if config.is_gated_act:
            self.DenseReluDense = T5DenseGatedActDense(config)
        else:
            self.DenseReluDense = T5DenseActDense(config)

        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        forwarded_states = self.DenseReluDense(forwarded_states)
        hidden_states = hidden_states + self.dropout(forwarded_states)
        return hidden_states


class T5Attention(nn.Module):
    def __init__(
        self,
        config: T5Config,
        has_relative_attention_bias=False,
        layer_idx: Optional[int] = None,
    ):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.has_relative_attention_bias = has_relative_attention_bias
        self.relative_attention_num_buckets = config.relative_attention_num_buckets
        self.relative_attention_max_distance = config.relative_attention_max_distance
        self.d_model = config.d_model
        self.key_value_proj_dim = config.d_kv
        self.n_heads = config.num_heads
        self.dropout = config.dropout_rate
        self.inner_dim = self.n_heads * self.key_value_proj_dim
        self.layer_idx = layer_idx
        if layer_idx is None and self.is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call, if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )

        # Mesh TensorFlow initialization to avoid scaling before softmax
        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)

        if self.has_relative_attention_bias:
            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
        self.pruned_heads = set()
        self.gradient_checkpointing = False

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.n_heads, self.key_value_proj_dim, self.pruned_heads
        )
        # Prune linear layers
        self.q = prune_linear_layer(self.q, index)
        self.k = prune_linear_layer(self.k, index)
        self.v = prune_linear_layer(self.v, index)
        self.o = prune_linear_layer(self.o, index, dim=1)
        # Update hyper params
        self.n_heads = self.n_heads - len(heads)
        self.inner_dim = self.key_value_proj_dim * self.n_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device=None, cache_position=None):
        """Compute binned relative position bias"""
        if device is None:
            device = self.relative_attention_bias.weight.device
        if cache_position is None:
            context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        else:
            context_position = cache_position[:, None].to(device)
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=(not self.is_decoder),
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        """
        # Input is (batch_size, seq_length, dim)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value states from the cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k, v, cross-attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value states to the cache for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # flag that this layer's cross-attention cache is filled so it can be re-used
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # compute scores, equivalent of torch.einsum("bnqd,bnkd->bnqk", query_states, key_states)
        scores = torch.matmul(query_states, key_states.transpose(3, 2))

        if position_bias is None:
            key_length = key_states.shape[-2]
            # cache position is 0-indexed, so we add 1 to get the real length of the queries (i.e. with past)
            real_seq_length = query_length if query_length is not None else cache_position[-1] + 1
            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length), device=scores.device, dtype=scores.dtype
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=scores.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, : key_states.shape[-2]]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            mask = torch.ones(position_bias.shape[1])
            mask[list(self.pruned_heads)] = 0
            position_bias_masked = position_bias[:, mask.bool()]
        else:
            position_bias_masked = position_bias

        scores += position_bias_masked

        # (batch_size, n_heads, seq_length, key_length)
        attn_weights = nn.functional.softmax(scores.float(), dim=-1).type_as(scores)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        # Mask heads if we want to
        if layer_head_mask is not None:
            attn_weights = attn_weights * layer_head_mask

        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)
        attn_output = self.o(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            outputs = outputs + (attn_weights,)
        return outputs


class T5LayerSelfAttention(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.SelfAttention = T5Attention(
            config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx
        )
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.SelfAttention(
            normed_hidden_states,
            mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states = hidden_states + self.dropout(attention_output[0])
        outputs = (hidden_states,) + attention_output[1:]  # add attentions if we output them
        return outputs


class T5LayerCrossAttention(nn.Module):
    def __init__(self, config, layer_idx: Optional[int] = None):
        super().__init__()
        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=False, layer_idx=layer_idx)
        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(
        self,
        hidden_states,
        key_value_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        query_length=None,
        output_attentions=False,
        cache_position=None,
    ):
        normed_hidden_states = self.layer_norm(hidden_states)
        attention_output = self.EncDecAttention(
            normed_hidden_states,
            mask=attention_mask,
            key_value_states=key_value_states,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            query_length=query_length,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        layer_output = hidden_states + self.dropout(attention_output[0])
        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
        return outputs


class T5Block(nn.Module):
    def __init__(self, config, has_relative_attention_bias=False, layer_idx: Optional[int] = None):
        super().__init__()
        self.is_decoder = config.is_decoder
        self.layer = nn.ModuleList()
        self.layer.append(
            T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, layer_idx=layer_idx)
        )
        if self.is_decoder:
            self.layer.append(T5LayerCrossAttention(config, layer_idx=layer_idx))

        self.layer.append(T5LayerFF(config))

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        encoder_decoder_position_bias=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        return_dict=True,
        cache_position=None,
    ):
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=past_key_value,
            use_cache=use_cache,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        hidden_states, past_key_value = self_attention_outputs[:2]
        attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        do_cross_attention = self.is_decoder and encoder_hidden_states is not None
        if do_cross_attention:
            cross_attention_outputs = self.layer[1](
                hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                position_bias=encoder_decoder_position_bias,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=past_key_value,
                query_length=cache_position[-1] + 1,
                use_cache=use_cache,
                output_attentions=output_attentions,
            )
            hidden_states, past_key_value = cross_attention_outputs[:2]

            # clamp inf values to enable fp16 training
            if hidden_states.dtype == torch.float16:
                clamp_value = torch.where(
                    torch.isinf(hidden_states).any(),
                    torch.finfo(hidden_states.dtype).max - 1000,
                    torch.finfo(hidden_states.dtype).max,
                )
                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

            # Keep cross-attention outputs and relative position weights
            attention_outputs = attention_outputs + cross_attention_outputs[2:]

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        if hidden_states.dtype == torch.float16:
            clamp_value = torch.where(
                torch.isinf(hidden_states).any(),
                torch.finfo(hidden_states.dtype).max - 1000,
                torch.finfo(hidden_states.dtype).max,
            )
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        outputs = (hidden_states,)

        if use_cache:
            outputs = outputs + (past_key_value,) + attention_outputs
        else:
            outputs = outputs + attention_outputs

        # hidden-states, past_key_value, (self-attention position bias), (self-attention weights),
        # (cross-attention position bias), (cross-attention weights)
        return outputs


class T5ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config: T5Config):
        super().__init__()
        self.dense = nn.Linear(config.d_model, config.d_model)
        self.dropout = nn.Dropout(p=config.classifier_dropout)
        self.out_proj = nn.Linear(config.d_model, config.num_labels)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.dense(hidden_states)
        hidden_states = torch.tanh(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.out_proj(hidden_states)
        return hidden_states


@auto_docstring
class T5PreTrainedModel(PreTrainedModel):
    config_class = T5Config
    load_tf_weights = load_tf_weights_in_t5
    base_model_prefix = "transformer"
    is_parallelizable = True
    supports_gradient_checkpointing = True
    _supports_quantized_cache = False
    _supports_static_cache = True
    _supports_cache_class = True
    _no_split_modules = ["T5Block"]
    _keep_in_fp32_modules = ["wo"]

    @property
    def dummy_inputs(self):
        input_ids = torch.tensor(DUMMY_INPUTS)
        input_mask = torch.tensor(DUMMY_MASK)
        dummy_inputs = {
            "decoder_input_ids": input_ids,
            "input_ids": input_ids,
            "decoder_attention_mask": input_mask,
        }
        return dummy_inputs

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor  # Used for testing weights initialization
        if isinstance(module, T5LayerNorm):
            module.weight.data.fill_(factor * 1.0)
        elif isinstance(module, (T5Model, T5ForConditionalGeneration, T5EncoderModel, T5ForQuestionAnswering)):
            # Mesh TensorFlow embeddings initialization
            module.shared.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
            if hasattr(module, "qa_outputs"):
                module.qa_outputs.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
                module.qa_outputs.bias.data.zero_()
        elif isinstance(module, T5ForTokenClassification):
            if hasattr(module, "classifier"):
                module.classifier.weight.data.normal_(mean=0.0, std=factor * 1.0)
                module.classifier.bias.data.zero_()
        elif isinstance(module, T5ClassificationHead):
            module.dense.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.dense, "bias") and module.dense.bias is not None:
                module.dense.bias.data.zero_()
            module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.out_proj, "bias") and module.out_proj.bias is not None:
                module.out_proj.bias.data.zero_()
        elif isinstance(module, T5DenseActDense):
            # Mesh TensorFlow FF initialization
            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi, "bias") and module.wi.bias is not None:
                module.wi.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, T5DenseGatedActDense):
            module.wi_0.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_0, "bias") and module.wi_0.bias is not None:
                module.wi_0.bias.data.zero_()
            module.wi_1.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
            if hasattr(module.wi_1, "bias") and module.wi_1.bias is not None:
                module.wi_1.bias.data.zero_()
            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
            if hasattr(module.wo, "bias") and module.wo.bias is not None:
                module.wo.bias.data.zero_()
        elif isinstance(module, T5Attention):
            # Mesh TensorFlow attention initialization to avoid scaling before softmax
            d_model = self.config.d_model
            key_value_proj_dim = self.config.d_kv
            n_heads = self.config.num_heads
            module.q.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
            module.k.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.v.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
            module.o.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
            if module.has_relative_attention_bias:
                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        if decoder_start_token_id is None:
            raise ValueError(
                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the "
                "pad_token_id. See T5 docs for more information."
            )

        # shift inputs to the right
        if is_torch_fx_proxy(input_ids):
            # Item assignment is not supported natively for proxies.
            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
        else:
            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
            shifted_input_ids[..., 0] = decoder_start_token_id

        if pad_token_id is None:
            raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        return shifted_input_ids


class T5Stack(T5PreTrainedModel):
    def __init__(self, config, embed_tokens=None):
        super().__init__(config)

        self.embed_tokens = embed_tokens
        self.is_decoder = config.is_decoder

        self.block = nn.ModuleList(
            [T5Block(config, has_relative_attention_bias=bool(i == 0), layer_idx=i) for i in range(config.num_layers)]
        )
        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
        self.dropout = nn.Dropout(config.dropout_rate)

        # Initialize weights and apply final processing
        self.post_init()
        # Model parallel
        self.model_parallel = False
        self.device_map = None
        self.gradient_checkpointing = False

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Stack.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your "
            "model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own "
            "`device_map` but it needs to be a dictionary module_name to device, so for instance "
            "{'block.0': 0, 'block.1': 1, ...}",
            FutureWarning,
        )
        # Check validity of device_map
        self.device_map = (
            get_device_map(len(self.block), range(torch.cuda.device_count())) if device_map is None else device_map
        )
        assert_device_map(self.device_map, len(self.block))
        self.model_parallel = True
        self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys()))
        self.last_device = "cuda:" + str(max(self.device_map.keys()))
        # Load onto devices
        for k, v in self.device_map.items():
            for layer in v:
                cuda_device = "cuda:" + str(k)
                self.block[layer] = self.block[layer].to(cuda_device)

        # Set embed_tokens to first layer
        self.embed_tokens = self.embed_tokens.to(self.first_device)
        # Set final layer norm to last device
        self.final_layer_norm = self.final_layer_norm.to(self.last_device)

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.model_parallel = False
        self.device_map = None
        self.first_device = "cpu"
        self.last_device = "cpu"
        for i in range(len(self.block)):
            self.block[i] = self.block[i].to("cpu")
        self.embed_tokens = self.embed_tokens.to("cpu")
        self.final_layer_norm = self.final_layer_norm.to("cpu")
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, new_embeddings):
        self.embed_tokens = new_embeddings

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        inputs_embeds=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        cache_position=None,
    ):
        # Model parallel
        if self.model_parallel:
            torch.cuda.set_device(self.first_device)
            self.embed_tokens = self.embed_tokens.to(self.first_device)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(
                f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
            )
        elif input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            err_msg_prefix = "decoder_" if self.is_decoder else ""
            raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if inputs_embeds is None:
            if self.embed_tokens is None:
                raise ValueError("You have to initialize the model with valid token embeddings")
            inputs_embeds = self.embed_tokens(input_ids)

        batch_size, seq_length = input_shape

        if use_cache is True:
            if not self.is_decoder:
                raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")

        # initialize past_key_values
        return_legacy_cache = False
        return_self_attention_cache = False
        if self.is_decoder and (use_cache or past_key_values is not None):
            if isinstance(past_key_values, Cache) and not isinstance(past_key_values, EncoderDecoderCache):
                return_self_attention_cache = True
                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache())
            elif not isinstance(past_key_values, EncoderDecoderCache):
                return_legacy_cache = True
                logger.warning_once(
                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. "
                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
                )
                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
            elif past_key_values is None:
                past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())
        elif not self.is_decoder:
            # do not pass a cache object down the line for the encoder stack:
            # it messes indexing later in the decoder stack because the cache object is modified in-place
            past_key_values = None

        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
            )

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)

        if self.config.is_decoder:
            causal_mask = self._update_causal_mask(
                attention_mask,
                inputs_embeds,
                cache_position,
                past_key_values.self_attention_cache if past_key_values is not None else None,
                output_attentions,
            )
        elif attention_mask is not None:
            causal_mask = attention_mask[:, None, None, :]
            causal_mask = causal_mask.to(dtype=inputs_embeds.dtype)
            causal_mask = (1.0 - causal_mask) * torch.finfo(inputs_embeds.dtype).min
        else:
            causal_mask = None

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        if self.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
                )
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
        cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and self.is_decoder) else None
        position_bias = None
        encoder_decoder_position_bias = None

        hidden_states = self.dropout(inputs_embeds)

        for i, layer_module in enumerate(self.block):
            layer_head_mask = head_mask[i]
            cross_attn_layer_head_mask = cross_attn_head_mask[i]
            # Model parallel
            if self.model_parallel:
                torch.cuda.set_device(hidden_states.device)
                # Ensure that all tensors are on the same device as hidden_states
                if causal_mask is not None:
                    causal_mask = causal_mask.to(hidden_states.device)
                if position_bias is not None:
                    position_bias = position_bias.to(hidden_states.device)
                if encoder_hidden_states is not None:
                    encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
                if encoder_extended_attention_mask is not None:
                    encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
                if encoder_decoder_position_bias is not None:
                    encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
                if layer_head_mask is not None:
                    layer_head_mask = layer_head_mask.to(hidden_states.device)
                if cross_attn_layer_head_mask is not None:
                    cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.forward,
                    hidden_states,
                    causal_mask,
                    position_bias,
                    encoder_hidden_states,
                    encoder_extended_attention_mask,
                    encoder_decoder_position_bias,
                    layer_head_mask,
                    cross_attn_layer_head_mask,
                    None,  # past_key_value is always None with gradient checkpointing
                    use_cache,
                    output_attentions,
                    return_dict,
                    cache_position,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask=causal_mask,
                    position_bias=position_bias,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_extended_attention_mask,
                    encoder_decoder_position_bias=encoder_decoder_position_bias,
                    layer_head_mask=layer_head_mask,
                    cross_attn_layer_head_mask=cross_attn_layer_head_mask,
                    past_key_value=past_key_values,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                    return_dict=return_dict,
                    cache_position=cache_position,
                )

            # layer_outputs is a tuple with:
            # hidden-states, key-value-states, (self-attention position bias), (self-attention weights),
            # (cross-attention position bias), (cross-attention weights)
            if use_cache is False:
                layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]

            hidden_states, next_decoder_cache = layer_outputs[:2]

            # We share the position biases between the layers - the first layer stores them
            position_bias = layer_outputs[2]
            if self.is_decoder and encoder_hidden_states is not None:
                encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[3],)
                if self.is_decoder:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[5],)

            # Model Parallel: if it's the last layer for that device, move things to the next device
            if self.model_parallel:
                for k, v in self.device_map.items():
                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
                        hidden_states = hidden_states.to("cuda:" + str(k + 1))

        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        # Add last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if return_self_attention_cache:
            next_cache = past_key_values.self_attention_cache
        if return_legacy_cache:
            next_cache = past_key_values.to_legacy_cache()

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_cache,
                    all_hidden_states,
                    all_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_attentions,
            cross_attentions=all_cross_attentions,
        )

    def _update_causal_mask(
        self,
        attention_mask: Union[torch.Tensor, "BlockMask"],
        input_tensor: torch.Tensor,
        cache_position: torch.Tensor,
        past_key_values: Cache,
        output_attentions: bool = False,
    ):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and (attention_mask == 0.0).any():
                return attention_mask
            return None
        if self.config._attn_implementation == "flex_attention":
            if isinstance(attention_mask, torch.Tensor):
                attention_mask = make_flex_block_causal_mask(attention_mask)
            return attention_mask

        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument,
        # in order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA
        # will fail to infer the attention mask.
        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False

        # When output_attentions is True, the SDPA implementation falls back to the eager path
        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
            if AttentionMaskConverter._ignore_causal_mask_sdpa(
                attention_mask,
                inputs_embeds=input_tensor,
                past_key_values_length=past_seen_tokens,
                is_training=self.training,
            ):
                return None

        dtype = input_tensor.dtype
        sequence_length = input_tensor.shape[1]
        if using_compilable_cache:
            target_length = past_key_values.get_max_cache_shape()
        else:
            target_length = (
                attention_mask.shape[-1]
                if isinstance(attention_mask, torch.Tensor)
                else past_seen_tokens + sequence_length + 1
            )

        # In case the provided `attention_mask` is 2D, we generate a causal mask here (4D).
        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
            attention_mask,
            sequence_length=sequence_length,
            target_length=target_length,
            dtype=dtype,
            cache_position=cache_position,
            batch_size=input_tensor.shape[0],
        )

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
            and not output_attentions
        ):
            # Attend to all tokens in fully masked rows of the causal_mask, for example the relevant first rows
            # when using left padding. This is required by the memory-efficient attention path of
            # F.scaled_dot_product_attention. Details: https://github.com/pytorch/pytorch/issues/110213
            min_dtype = torch.finfo(dtype).min
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask

    @staticmethod
    def _prepare_4d_causal_attention_mask_with_cache_position(
        attention_mask: torch.Tensor,
        sequence_length: int,
        target_length: int,
        dtype: torch.dtype,
        cache_position: torch.Tensor,
        batch_size: int,
        **kwargs,
    ):
        """
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        """
        if attention_mask is not None and attention_mask.dim() == 4:
            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
            causal_mask = attention_mask
        else:
            min_dtype = torch.finfo(dtype).min
            causal_mask = torch.full(
                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
            )
            if sequence_length != 1:
                causal_mask = torch.triu(causal_mask, diagonal=1)
            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
            if attention_mask is not None:
                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
                    causal_mask.device
                )
                padding_mask = padding_mask == 0
                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
                    padding_mask, min_dtype
                )

        return causal_mask


__HEAD_MASK_WARNING_MSG = """
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
"""
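
# Illustrative round trip through the encoder-decoder stack defined below (a sketch
# mirroring the standard usage of this class; any T5 checkpoint works in place of
# "google-t5/t5-small"):
#
#     from transformers import AutoTokenizer, T5Model
#
#     tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
#     model = T5Model.from_pretrained("google-t5/t5-small")
#     enc = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
#     dec = tokenizer("Studies show that", return_tensors="pt")
#     outputs = model(input_ids=enc.input_ids, decoder_input_ids=dec.input_ids)
#     last_hidden_state = outputs.last_hidden_state  # (batch, target_len, d_model)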
"""


@auto_docstring
class T5Model(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your"
            " model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class `PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
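
For instance (an illustrative sketch; `model` is assumed to be a loaded `T5Model` as in the
example below):

```python
>>> decoder_head_mask = torch.ones(model.config.num_decoder_layers, model.config.num_heads)
>>> decoder_head_mask[0, 0] = 0.0  # nullify the first head of the first decoder layer
>>> cross_attn_head_mask = torch.ones(model.config.num_decoder_layers, model.config.num_heads)
```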

Example:

```python
>>> from transformers import AutoTokenizer, T5Model

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5Model.from_pretrained("google-t5/t5-small")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for T5Model.
>>> # This is not needed for torch's T5ForConditionalGeneration as it does this internally using labels arg.
>>> decoder_input_ids = model._shift_right(decoder_input_ids)

>>> # forward pass
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
>>> last_hidden_states = outputs.last_hidden_state
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    T5 Model with a `language modeling` head on top.
    """
)
class T5ForConditionalGeneration(T5PreTrainedModel, GenerationMixin):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        return self.lm_head

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
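
    For instance (an illustrative incremental-decoding sketch; `model`, `input_ids` and a first
    `decoder_input_ids` are assumed to exist as in the examples below):

    ```python
    >>> out = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, use_cache=True)
    >>> next_token = out.logits[:, -1:].argmax(-1)
    >>> out = model(
    ...     decoder_input_ids=next_token,  # only the newly generated token is fed back
    ...     past_key_values=out.past_key_values,
    ...     encoder_outputs=(out.encoder_last_hidden_state,),
    ... )
    ```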
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked); the loss is only computed for
    labels in `[0, ..., config.vocab_size - 1]`.
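
A common preprocessing step (an illustrative sketch; `tokenizer` as in the examples below) is to
replace padding positions in `labels` with `-100` so they are ignored by the loss:

```python
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> labels[labels == tokenizer.pad_token_id] = -100
```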

Examples:

```python
>>> from transformers import AutoTokenizer, T5ForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

>>> # training
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
>>> outputs = model(input_ids=input_ids, labels=labels)
>>> loss = outputs.loss
>>> logits = outputs.logits

>>> # inference
>>> input_ids = tokenizer(
...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model.generate(input_ids)
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
>>> # studies have shown that owning a dog is good for you.
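
>>> # a further sketch (same `generate` API, assumed defaults): beam search decoding
>>> beam_outputs = model.generate(input_ids, num_beams=4, max_new_tokens=20)
>>> print(tokenizer.decode(beam_outputs[0], skip_special_tokens=True))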
```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab; see the Mesh TensorFlow transformer implementation
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to the logits device to enable model parallelism
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output, speedy decoding is disabled and no reordering is needed
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from the layer past batch dim
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set the correct `past` for each of the key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and"
                    f" layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of"
                    f" layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past


@auto_docstring
class T5EncoderModel(T5PreTrainedModel):
    _tied_weights_keys = ["encoder.embed_tokens.weight"]
    _keys_to_ignore_on_load_unexpected = [r"decoder"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    @add_start_docstrings(PARALLELIZE_DOCSTRING)
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5EncoderModel.parallelize` is deprecated and will be removed in v5 of Transformers, you should load"
            " your model with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your"
            " own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'block.0': 0, 'block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.model_parallel = True

    @add_start_docstrings(DEPARALLELIZE_DOCSTRING)
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See
        base class `PreTrainedModel`.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], BaseModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).

Example:

```python
>>> from transformers import AutoTokenizer, T5EncoderModel

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5EncoderModel.from_pretrained("google-t5/t5-small")
>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids)
>>> last_hidden_states = outputs.last_hidden_state
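
>>> # a further sketch: mean-pool the token states into one vector per sequence
>>> sentence_embedding = last_hidden_states.mean(dim=1)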
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return encoder_outputs


@auto_docstring(
    custom_intro="""
    T5 model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for GLUE
    tasks.
    """
)
class T5ForSequenceClassification(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.transformer = T5Model(config)
        self.classification_head = T5ClassificationHead(config)

        # Initialize weights and apply final processing
        self.post_init()

        self.model_parallel = False

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
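
Example (an illustrative sketch; a checkpoint fine-tuned with a classification head is assumed —
loading the base weights as below leaves the head randomly initialized):

```python
>>> import torch
>>> from transformers import AutoTokenizer, T5ForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForSequenceClassification.from_pretrained("google-t5/t5-small", num_labels=2)

>>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(input_ids=inputs.input_ids).logits  # (batch_size, num_labels)
```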
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            use_cache = False

        if input_ids is None and inputs_embeds is not None:
            raise NotImplementedError(
                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
            )

        # T5 derives decoder_input_ids by shifting input_ids to the right when none are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]

        eos_mask = input_ids.eq(self.config.eos_token_id).to(sequence_output.device)

        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
            raise ValueError("All examples must have the same number of <eos> tokens.")
        batch_size, _, hidden_size = sequence_output.shape
        sentence_representation = sequence_output[eos_mask, :].view(batch_size, -1, hidden_size)[:, -1, :]
        logits = self.classification_head(sentence_representation)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.config.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.config.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSequenceClassifierOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )


@auto_docstring
class T5ForTokenClassification(T5PreTrainedModel):
    _tied_weights_keys = ["transformer.encoder.embed_tokens.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = T5EncoderModel(config)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
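
Example (an illustrative sketch; a token-classification fine-tuned checkpoint is assumed —
loading the base weights as below leaves the head randomly initialized):

```python
>>> from transformers import AutoTokenizer, T5ForTokenClassification

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForTokenClassification.from_pretrained("google-t5/t5-small", num_labels=3)

>>> inputs = tokenizer("Studies show that dogs are good", return_tensors="pt")
>>> logits = model(**inputs).logits  # (batch_size, sequence_length, num_labels)
```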
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class T5ForQuestionAnswering(T5PreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [
        "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    def __init__(self, config: T5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.shared = nn.Embedding(config.vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = T5Stack(encoder_config, self.shared)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = T5Stack(decoder_config, self.shared)

        self.num_labels = config.num_labels
        self.qa_outputs = nn.Linear(config.d_model, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

        self.model_parallel = False

    def get_input_embeddings(self):
        return self.shared

    def set_input_embeddings(self, new_embeddings):
        self.shared = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared)
            self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        start_positions: Optional[torch.LongTensor] = None,
        end_positions: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqQuestionAnsweringModelOutput]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
    should be able to pad the inputs on both the right and the left.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)

    To know more on how to prepare `input_ids` for pretraining take a look at [T5 Training](./t5#training).
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
    is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

    To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
    Training](./t5#training).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
    1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
    `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
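
Example (an illustrative extractive-QA sketch; a fine-tuned checkpoint is assumed — loading the
base weights as below leaves the QA head randomly initialized):

```python
>>> from transformers import AutoTokenizer, T5ForQuestionAnswering

>>> tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
>>> model = T5ForQuestionAnswering.from_pretrained("google-t5/t5-small")

>>> question, context = "What is good for you?", "Studies have shown that owning a dog is good for you."
>>> inputs = tokenizer(question, context, return_tensors="pt")
>>> outputs = model(**inputs)
>>> start, end = outputs.start_logits.argmax(-1), outputs.end_logits.argmax(-1)
```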
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        if start_positions is not None and end_positions is not None:
            use_cache = False

        if decoder_input_ids is None and decoder_inputs_embeds is None:
            if input_ids is None:
                raise ValueError(
                    "If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. "
                    "Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
                )
            decoder_input_ids = self._shift_right(input_ids)

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=None,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1).contiguous()
        end_logits = end_logits.squeeze(-1).contiguous()

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, squeeze the extra dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1).to(start_logits.device)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1).to(end_logits.device)
            # sometimes the start/end positions are outside our model inputs; we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + decoder_outputs[1:] + encoder_outputs
            return ((total_loss,) + output) if total_loss is not None else output

        return Seq2SeqQuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


__all__ = [
    "load_tf_weights_in_t5",
    "T5EncoderModel",
    "T5ForConditionalGeneration",
    "T5Model",
    "T5PreTrainedModel",
    "T5ForQuestionAnswering",
    "T5ForSequenceClassification",
    "T5ForTokenClassification",
]