o
    Zh$                    @   s  d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, e' rddl-m.Z. ddl/m0Z0 e*1e2Z3d\de
j4de5de5de5de
j4f
ddZ6de
j4de5de5de
j4fddZ7d\de
j4de5de5de5de
j4f
ddZ8de5de
j4fd d!Z9d"e
j4de5de
j4fd#d$Z:d%e
j4de5d&e
j;de
j4fd'd(Z<d%e
j4d)e5dee
j4e
j4f fd*d+Z=d%e
j4d)e5de
j4fd,d-Z>d.e
j4d/e
j4d0e5de
j4fd1d2Z?G d3d4 d4ej@ZAzdd5lBmCZC eCZAe3Dd6 W n eEy;   Y n eFyI   e3Gd7 Y nw e HeA G d8d9 d9ej@ZIG d:d; d;ej@ZJG d<d= d=ej@ZKG d>d? d?ej@ZLG d@dA dAej@ZMG dBdC dCej@ZNG dDdE dEej@ZOG dFdG dGej@ZPG dHdI dIej@ZQG dJdK dKej@ZRG dLdM dMej@ZSe&G dNdO dOeZTG dPdQ dQeTZUdRZVe&G dSdT dTeTZWe&dUdVG dWdX dXeTeZXe&G dYdZ dZeTZYg d[ZZdS )]zPyTorch LongT5 model.    N)AnyListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 C   s   | j |  | }t| j s"t| j }||  |7  < tj|| jdS dg| j }d|f||< t|ddd d}tj	j
| |d|d} | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr)   ndimsumr   
functionalr/   )r#   r$   r%   r&   Zpad_lenZ	new_shaper/   r,   r,   Y/var/www/auris/lib/python3.10/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multiple>   s   

r;   c                 C   s~   | j | | dkrt| ||dd} | j | | }| j d| ||f | j |d d  }d|v r:tj|| j| jdS | |S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
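
The two helpers documented here (visible above only as compiled bytecode) perform the block-preparation step of LongT5's local attention: the sequence dimension is right-padded to a multiple of `block_len` and then reshaped into `(num_blocks, block_len)`. Below is a minimal sketch of that behavior in plain PyTorch; the function names are illustrative stand-ins for the compiled `_pad_to_multiple` / `_split_into_blocks` helpers, which additionally special-case empty inputs.

```python
import torch
from torch import nn


def pad_to_multiple(x: torch.Tensor, block_len: int, dim: int, pad_value: int = 0) -> torch.Tensor:
    # Right-pad `dim` so that its length becomes a multiple of `block_len`.
    pad_len = -x.shape[dim] % block_len
    pad = [(0, 0)] * x.ndim
    pad[dim] = (0, pad_len)
    pad = sum(pad[::-1], ())  # nn.functional.pad expects pairs ordered from the last dim backwards
    return nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)


def split_into_blocks(x: torch.Tensor, block_len: int, dim: int) -> torch.Tensor:
    # Pad first, then reshape `dim` into (num_blocks, block_len).
    if x.shape[dim] % block_len != 0:
        x = pad_to_multiple(x, block_len, dim, pad_value=0)
    num_blocks = x.shape[dim] // block_len
    output_shape = x.shape[:dim] + (num_blocks, block_len) + x.shape[dim + 1 :]
    return x.reshape(output_shape)


hidden = torch.randn(2, 10, 8)                              # (batch, seq_len, hidden)
print(split_into_blocks(hidden, block_len=4, dim=1).shape)  # torch.Size([2, 3, 4, 8])
```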
    r   )r&   Nr   r)   device)r2   r;   r5   emptyr)   r=   reshape)r#   r$   r%   
num_blocksZoutput_shaper,   r,   r:   _split_into_blocksN   s   (
rA   	block_dimsequence_dimc           	      C   s   | j | }dg| j }d||< t|ddd d}tjj| |d|d} g }tdD ]}td	dg| j }t||| ||< t|}|	| |  q)t
j||d
S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
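
Per this docstring, each block attends to itself plus its left and right neighbours, so the key/value tensors are built by stacking three consecutive blocks along the sequence dimension. A sketch of that operation for the blocked layout shown above; the function name is illustrative, and the compiled helper pads the block dimension in the same way.

```python
import torch
from torch import nn


def concatenate_3_blocks(x: torch.Tensor, block_dim: int, sequence_dim: int, pad_value: int = 0) -> torch.Tensor:
    """For every block, stack [previous block, current block, next block] along the sequence dim."""
    num_blocks = x.shape[block_dim]

    # Add one dummy (padded) block on each side of the block dimension.
    pad = [(0, 0)] * x.ndim
    pad[block_dim] = (1, 1)
    pad = sum(pad[::-1], ())
    x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)

    # Three shifted windows over the padded block dimension, concatenated along the sequence dim.
    blocks_list = []
    for i in range(3):
        indices = [slice(0, None)] * x.ndim
        indices[block_dim] = slice(i, i + num_blocks)
        blocks_list.append(x[tuple(indices)])
    return torch.cat(blocks_list, dim=sequence_dim)


blocked = torch.randn(2, 3, 4, 8)                   # (batch, num_blocks, block_len, hidden)
print(concatenate_3_blocks(blocked, block_dim=1, sequence_dim=2).shape)  # torch.Size([2, 3, 12, 8])
```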
    r*   )r   r   Nr+   r,   r-   r.   r	   r   r%   )r2   r7   r8   r   r9   r/   rangeslicetupleappendr5   cat)	r#   rB   rC   r&   r@   r/   Zblocks_listiindicesr,   r,   r:   _concatenate_3_blocks]   s   
rL   c                 C   s:   t jd|  t jd}|| |   }|d|d }|S )z:Makes 3-blocked relative position ids for local attention.r	   r(   r   r   )r5   arangeZint32	unsqueeze)r$   Zposition_idsZcenter_position_idsrelative_position_idsr,   r,   r:   "_make_3block_relative_position_idsv   s   rP   local_attention_maskc                 C   sF   t |}t||k }|ddddddf }|| j}t| |S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)rP   r5   abstor=   logical_and)rQ   r$   rO   Zlocality_maskr,   r,   r:   _mask_local_attention_mask   s
   rU   attention_maskr=   c                 C   sV   t | |dd}t|ddd}|d}|d}t||}t||}|d|S )z;Prepare attention mask to be applied for a local attention.r   rD      rB   rC   r+   )rA   rL   rN   r5   rT   rU   rS   )rV   r$   r=   Z_blocked_attention_maskZ_3blocked_attention_maskrQ   r,   r,   r:   _get_local_attention_mask   s   


rZ   global_block_sizec                    s^  | j dd \}dtjdtjf fdd}tj| | jd  }tj|dd	| }t| d
kdd| j}t	|| d | j}tj
d|j|jd}t||k||}||  | d  }||}  }|dkr|tj|ddj|ddd}	ntj|d|j|jd}	tjt||ddd }
|
| j}
t|
|	kdd}
|tj|
tjfS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformer implementation, adapted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make up
    a whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
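
A simplified sketch of the mapping described here: every token is assigned to global block `i // global_block_size`, padding tokens get id -1, and a segment id marks which global blocks contain any real token. The function name is illustrative, and the orphan-token refinement mentioned above (folding a trailing partial block into the preceding one) is deliberately omitted.

```python
import torch


def make_global_fixed_block_ids_simplified(attention_mask: torch.Tensor, global_block_size: int):
    """Token i -> global block i // global_block_size; padding tokens -> -1 (orphan handling omitted)."""
    batch_size, seq_len = attention_mask.shape
    positions = torch.arange(seq_len, device=attention_mask.device)
    block_ids = (positions // global_block_size).unsqueeze(0).expand(batch_size, -1).clone()
    block_ids[attention_mask == 0] = -1

    num_globals = seq_len // global_block_size
    # A global block is marked 1 if any of its tokens is non-padding, else 0.
    global_segment_ids = torch.zeros(batch_size, num_globals, dtype=attention_mask.dtype,
                                     device=attention_mask.device)
    for b in range(num_globals):
        start, end = b * global_block_size, (b + 1) * global_block_size
        global_segment_ids[:, b] = (attention_mask[:, start:end].sum(dim=-1) > 0).to(attention_mask.dtype)
    return block_ids, global_segment_ids


mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
ids, segments = make_global_fixed_block_ids_simplified(mask, global_block_size=4)
print(ids)       # tensor([[ 0,  0,  0,  0,  1,  1, -1, -1]])
print(segments)  # tensor([[1, 1]])
```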
    NrW   	block_idsr'   c                    sd   t    d k}|| j}t || dk}|dd| jd }t 	| |k | |} | S )Nr   r   r+   )
r5   rM   rS   r=   rT   r8   rN   typer)   where)r\   Z
block_endsZtrue_block_endsZfull_blocksr[   Zseq_lenr,   r:   handle_orphan_tokens   s   z:_make_global_fixed_block_ids.<locals>.handle_orphan_tokensr=   r   )Zaxis              ?g     @r+   r<   r   rD   )r2   r5   TensorZ	ones_liker=   Zcumsumr^   r]   r)   floortensormaxvaluesrepeat	transposer6   onesrS   int)rV   r[   
batch_sizer`   Zfixed_block_maskmaskZglobal_block_idsZ_global_block_ids_lower_boundZnum_globalsZ_sequence_block_ids_maxglobal_segment_idsr,   r_   r:   _make_global_fixed_block_ids   s,   
"rp   c                 C   s@   t | |\}}|jd }tj||jd}||d  }|tjS )zBCreate the relative position tensor for local -> global attention.r+   ra   .N)rp   r2   r5   rM   r=   r]   int64)rV   r[   r\   ro   global_seq_lenZglobal_positionsside_relative_positionr,   r,   r:    _make_side_relative_position_ids   s
   
ru   hidden_statesr\   rs   c                 C   sf   | |dktj||j|jd}tj|tj	|d ddddddf }t
d| || jS )zFCompute individual block aggregates by summing over individual blocks.r   r<   r   Nr+   z...nd,...ng->...gd)r^   r5   rf   r)   r=   r   r9   Zone_hotr]   rr   einsum)rv   r\   rs   Zone_hot_block_idsr,   r,   r:   _create_global_aggregates   s
   0rx   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LongT5LayerNormư>c                    s&   t    tt|| _|| _dS )zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
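
As described, this layer norm is an RMS norm: it rescales the hidden states by their root mean square and applies a learned weight, with no mean subtraction and no bias. A minimal equivalent sketch (class name is illustrative):

```python
import torch
from torch import nn


class RMSNormSketch(nn.Module):
    """T5/LongT5-style layer norm: rescale by the root mean square, learned weight, no bias."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Variance is accumulated in float32 for stability, then cast back to half precision if needed.
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            hidden_states = hidden_states.to(self.weight.dtype)
        return self.weight * hidden_states


print(RMSNormSketch(8)(torch.randn(2, 4, 8)).shape)  # torch.Size([2, 4, 8])
```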
        N)super__init__r   	Parameterr5   rk   weightvariance_epsilon)selfZhidden_sizeeps	__class__r,   r:   r|      s   

zLongT5LayerNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )NrW   r+   T)Zkeepdim)rS   r5   Zfloat32powmeanZrsqrtr   r~   r)   float16Zbfloat16)r   rv   Zvariancer,   r,   r:   forward   s
   
zLongT5LayerNorm.forward)rz   )__name__
__module____qualname__r|   r   __classcell__r,   r,   r   r:   ry      s    ry   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                       *   e Zd Zdef fddZdd Z  ZS )LongT5DenseActDenseconfigc                    sT   t    tj|j|jdd| _tj|j|jdd| _t|j	| _
t|j | _d S NFbias)r{   r|   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr
   dense_act_fnactr   r   r   r,   r:   r|   
  s
   
zLongT5DenseActDense.__init__c                 C   sl   |  |}| |}| |}t| jjtjr/|j| jjjkr/| jjjtj	kr/|
| jjj}| |}|S N)r   r   r   
isinstancer   r~   r5   rd   r)   Zint8rS   )r   rv   r,   r,   r:   r     s   



zLongT5DenseActDense.forwardr   r   r   r    r|   r   r   r,   r,   r   r:   r   	  s    r   c                       r   )LongT5DenseGatedActDenser   c                    sj   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _t|j | _d S r   )r{   r|   r   r   r   r   wi_0wi_1r   r   r   r   r
   r   r   r   r   r,   r:   r|      s   
z!LongT5DenseGatedActDense.__init__c                 C   s:   |  | |}| |}|| }| |}| |}|S r   )r   r   r   r   r   )r   rv   Zhidden_geluZhidden_linearr,   r,   r:   r   (  s   


z LongT5DenseGatedActDense.forwardr   r,   r,   r   r:   r     s    r   c                       r   )LongT5LayerFFr   c                    sJ   t    |jrt|| _nt|| _t|j|jd| _	t
|j| _d S )Nr   )r{   r|   Zis_gated_actr   DenseReluDenser   ry   r   layer_norm_epsilon
layer_normr   r   r   r   r   r   r,   r:   r|   3  s   

zLongT5LayerFF.__init__c                 C   s&   |  |}| |}|| | }|S r   )r   r   r   )r   rv   Zforwarded_statesr,   r,   r:   r   =  s   

zLongT5LayerFF.forwardr   r,   r,   r   r:   r   2  s    
r   c                       sl   e Zd Z		ddedee f fddZdd ZedddZ	dddZ
									dddZ  ZS )LongT5AttentionFNr   	layer_idxc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| j
| j | _|| _|d u r@| jr@td| jj d tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrxt| j| j
| _t | _d| _d S )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r{   r|   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   r,   r:   r|   F  s.   

zLongT5Attention.__init__c                 C      t |dkrd S t|| j| j| j\}}t| j|| _t| j|| _t| j|| _t| j	|dd| _	| jt | | _| j| j | _
| j|| _d S Nr   r   rD   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexr,   r,   r:   prune_headsi     zLongT5Attention.prune_headsT       c                 C      d}|r|d }|| dk tj| 7 }t| } n
t| t|  } |d }| |k }|t|  | t||  ||   tj }t|t	||d }|t
|| |7 }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on.

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
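
The bucketing described above can be sketched as follows: half of the buckets hold exact small offsets, the other half cover larger offsets on a logarithmic scale up to `max_distance`, and, when bidirectional, positive and negative offsets use disjoint halves. This is an illustrative standalone version following the description in this docstring, not the compiled method itself.

```python
import math
import torch


def relative_position_bucket(relative_position: torch.Tensor, bidirectional: bool = True,
                             num_buckets: int = 32, max_distance: int = 128) -> torch.Tensor:
    buckets = torch.zeros_like(relative_position)
    if bidirectional:
        num_buckets //= 2
        buckets = buckets + (relative_position > 0).long() * num_buckets
        relative_position = torch.abs(relative_position)
    else:
        relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))

    # First half of the (remaining) buckets: exact offsets 0 .. max_exact - 1.
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # Second half: logarithmically sized bins up to max_distance, clamped to the last bucket.
    large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).long()
    large = torch.min(large, torch.full_like(large, num_buckets - 1))

    return buckets + torch.where(is_small, relative_position, large)


print(relative_position_bucket(torch.tensor([-100, -8, -1, 0, 1, 8, 100])))
```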
        r   rW   r   rS   r5   longrR   minZ
zeros_likelogfloatmathZ	full_liker^   relative_positionbidirectionalnum_bucketsmax_distanceZrelative_bucketsZ	max_exactZis_smallZrelative_position_if_larger,   r,   r:   _relative_position_buckety  s*   z)LongT5Attention._relative_position_bucketc           
      C   s   |du r	| j jj}|du rtj|tj|ddddf }n|dddf |}tj|tj|ddddf }|| }| j|| j | j	| j
d}|  |}	|	g dd}	|	S )%Compute binned relative position biasNr<   r   r   r   rW   r   r   r   )r   r~   r=   r5   rM   r   rS   r   r   r   r   permuterN   )
r   query_length
key_lengthr=   cache_positioncontext_positionmemory_positionr   relative_position_bucketrh   r,   r,   r:   compute_bias  s    
 
zLongT5Attention.compute_biasc                 C   s  |j dd \}}|du}| |}||d| j| jdd}|dur4|j| j}|r1|j	}n|j
}|r8|n|}|rO|durO|rO|j| j }|j| j }nE| |}| |}||d| j| jdd}||d| j| jdd}|dur|s}|
nd}
|||| jd|
i\}}|rd|j| j< t||dd}|du r|j d }|dur|n|
d d }| jstjd| j||f|j|jd	}| jr| jrd|_n| j|||j|
d
}|dddd| dddf }|dur|ddddddd|j d f }|| }| jr%t|j d }d|t| j< |dd| f }n|}||7 }tjj |! dd"|}tjj#|| j#| jd}|durL|| }t||}|dd$ }||d| j%}| &|}|||f}|	rt||f }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        NrW   r+   r   r   Tr	   rY   r=   r)   )r=   r   r   rD   ptraining)'r2   r   viewr   r   rj   
is_updatedgetr   Zcross_attention_cacheself_attention_cacheZ	key_cacheZvalue_cacher   r   updater5   matmulr   r6   r=   r)   r   r   requires_gradr   r   rk   r4   boolr   r9   softmaxr   type_asr   
contiguousr   r   )r   rv   rn   key_value_statesposition_biaspast_key_valuelayer_head_maskr   	use_cacheoutput_attentionsr   rm   
seq_lengthZis_cross_attentionquery_statesr   Zcurr_past_key_valueZcurrent_states
key_statesvalue_statesscoresr   Zreal_seq_lengthcausal_maskZposition_bias_maskedattn_weightsattn_outputoutputsr,   r,   r:   r     sx   





"
&



zLongT5Attention.forwardFNTr   r   )NN)	NNNNNNFFN)r   r   r   r    r   rl   r|   r   staticmethodr   r   r   r   r,   r,   r   r:   r   E  s,    #
/r   c                       sb   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
				dddZ  ZS )LongT5LocalAttentionFr   r   r'   Nc                    s   t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrmt| j| j
| _t | _d| _d S )Nr   Fr   )r{   r|   r   r   r   r   r   r   r   r   r   local_radiusr$   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r,   r:   r|   *  s(   

zLongT5LocalAttention.__init__c                 C   r   r   r   r   r,   r,   r:   r   D  r   z LongT5LocalAttention.prune_headsTr   r   c                 C   r   r   r   r   r,   r,   r:   r   T  *   z.LongT5LocalAttention._relative_position_bucketblock_lengthc                 C      | j jjjdkr| j jjnd}tjd| tj|d}|||  }|dddf |dddf  }| j|| j | j	| j
d}|  |}|g ddd}|S r   metaNr	   r<   r   r   r   r   r~   r=   r]   r5   rM   r   r   r   r   r   r   rN   r   r
  Ztarget_devicer   r   r   r   rh   r,   r,   r:   r         
 
z!LongT5LocalAttention.compute_biasc                    s  |j d d \ } fdd} fdd}||}	||}
||}t|	jdd}	t|
jdd}
t|jdd}t|
ddd}
t|ddd}td	|	|
}|d u rj	s~tj
ddjjd
j f|j|jd}jr}jr}d|_nj}|d urt|dkdd}||dd }||7 }tjj| dd|}tjj|jjd}|d ur|| }||j}|td||}|d d d |d d f }|}d }|f|f |f }|r||f }|S )NrW   c                       |   djjS Z
projectionr+   r   r   r   Zstatesrm   r   r,   r:   r2        z+LongT5LocalAttention.forward.<locals>.shapec                       |    djS r?   r+   r   r   r   r  r  r,   r:   unshape  r  z-LongT5LocalAttention.forward.<locals>.unshaper   rD   rX   ...qhd,...khd->...hqkr	   r   Tr   rb       _r+   r   ...hqk,...khd->...qhd)r2   r   r   r   rA   r$   rL   r5   rw   r   r6   r   r=   r)   r   r   r   r   r^   rj   r   r9   r   r   r   r   r]   r   )r   rv   rn   r   r   r   r   r2   r  r   r   r   r   r   r  present_key_value_stater  r,   r  r:   r     sP   

zLongT5LocalAttention.forwardFr  NNNF)r   r   r   r    r   r|   r   r  r   rl   r   r   r   r,   r,   r   r:   r  )  s    /r  c                       s~   e Zd Zddededdf fddZdd	 ZedddZde	fddZ
dejdejdejfddZ				dddZ  ZS )LongT5TransientGlobalAttentionFr   r   r'   Nc                    s  t    |j| _|| _|j| _|j| _|j| _|j| _|j	| _
|j| _| jd | _|j| _|j| _| j
| j | _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _| jrqt| j| j
| _t | _| jrt| j| j
| _t|j|jd| _d S )Nr   Fr   r   )r{   r|   r   r   r   r   r   r   r   r   r   r  r$   r[   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasry   r   global_input_layer_normr  r   r,   r:   r|     s.   
z'LongT5TransientGlobalAttention.__init__c                 C   r   r   r   r   r,   r,   r:   r   	  r   z*LongT5TransientGlobalAttention.prune_headsTr   r   c                 C   r   r   r   r   r,   r,   r:   r     r	  z8LongT5TransientGlobalAttention._relative_position_bucketr
  c                 C   r  r  r  r  r,   r,   r:   r   J  r  z+LongT5TransientGlobalAttention.compute_biasrn   ro   c                 C   s   t |d |d d d d d f d d d df }t |dkdd}t|| j}| j|| j | j| jd}| 	|}|
g d}|| }|S )Nrq   .r   rb   r  r   )r   r	   r   rW   )r5   eqr^   ru   r[   r   r   r   r   r"  r   )r   rn   ro   Zside_attention_maskZattention_side_biasrt   Zside_relative_position_bucketZ	side_biasr,   r,   r:   compute_side_biasb  s   0
z0LongT5TransientGlobalAttention.compute_side_biasc                    s@  |j d d \ } fdd} fdd}t|d ur|n	t|j d d j\}	}
|
j d }t||	|}|}||}||}|	|}||}|	|}t
|jdd}t
|jdd}t
|jdd}t|ddd	}t|ddd	}dg|jd  }|j d |d< |d|}|d|}tj||gdd}tj||gdd}td
||}|d urt|j|j}t|dkdd}nd }|d u rEjstjddjjdj f|j|jd}jrjrd|_nj}|d ur||dd }||j}|d u r t |}||
}t
|jdddd}||j |j}tj||gdd}||7 }t!j"j#|$ dd%|}t!j"j&|j&jd}|d urj|| }||j}|td||}|d d d |d d f }'|}d }|f|f |f }|r||f }|S )NrW   c                    r  r  r  r  r  r,   r:   r2     r  z5LongT5TransientGlobalAttention.forward.<locals>.shapec                    r  r  r  r  r  r,   r:   r    r  z7LongT5TransientGlobalAttention.forward.<locals>.unshaper+   r   rD   rX   r  r   rb   r  r	   r   TrY   r   r  )(r2   rp   r5   rk   r[   rx   r#  r   r   r   rA   r$   rL   r7   rN   ri   rI   rw   rZ   r=   r^   r   r6   r   r)   r   r   r   r   rj   r]   r%  rS   r   r9   r   r   r   r   r   )r   rv   rn   r   r   r   r   r2   r  r\   ro   Z_global_seq_lenZglobal_inputsr   r   r   Zside_key_statesZside_value_statesZrepsr   rQ   Zside_position_biasr   r  r  r  r,   r  r:   r   w  s   







z&LongT5TransientGlobalAttention.forwardr  r  r   )r   r   r   r    r   r|   r   r  r   rl   r   r5   rd   r%  r   r   r,   r,   r   r:   r!    s    /r!  c                       s@   e Zd Zddee f fddZ							d	ddZ  ZS )
LongT5LayerSelfAttentionFNr   c                    s>   t    t|||d| _t|j|jd| _t	|j
| _d S )Nr   r   r   )r{   r|   r   SelfAttentionry   r   r   r   r   r   r   r   r   r   r,   r:   r|     s   
z!LongT5LayerSelfAttention.__init__c	              
   C   sL   |  |}	| j|	|||||||d}
|| |
d  }|f|
dd   }|S )N)rn   r   r   r   r   r   r   r   r   )r   r(  r   )r   rv   rV   r   r   r   r   r   r   normed_hidden_statesattention_outputr  r,   r,   r:   r     s   

z LongT5LayerSelfAttention.forwardr  )NNNNFFNr   r   r   r   rl   r|   r   r   r,   r,   r   r:   r&    s    r&  c                       D   e Zd ZdZd
dee f fddZ				ddefdd	Z  Z	S )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderFNr   c                    <   t    t||d| _t|j|jd| _t	|j
| _d S N)r   r   )r{   r|   r  LocalSelfAttentionry   r   r   r   r   r   r   r   r   r   r,   r:   r|     s   
z&LongT5LayerLocalSelfAttention.__init__kwargsc           
      K   F   |  |}| j|||||d}|| |d  }|f|dd   }	|	S N)rn   r   r   r   r   r   )r   r0  r   
r   rv   rV   r   r   r   r1  r)  r*  r  r,   r,   r:   r        
	z%LongT5LayerLocalSelfAttention.forwardr  r   
r   r   r   __doc__r   rl   r|   r   r   r   r,   r,   r   r:   r-    s    	r-  c                       r,  )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderFNr   c                    r.  r/  )r{   r|   r!  TransientGlobalSelfAttentionry   r   r   r   r   r   r   r   r   r   r,   r:   r|   7  s   
z0LongT5LayerTransientGlobalSelfAttention.__init__r1  c           
      K   r2  r3  )r   r9  r   r4  r,   r,   r:   r   ?  r5  z/LongT5LayerTransientGlobalSelfAttention.forwardr  r   r6  r,   r,   r   r:   r8  4  s    r8  c                       sB   e Zd Zddee f fddZ								d	ddZ  ZS )
LongT5LayerCrossAttentionNr   c                    s>   t    t|d|d| _t|j|jd| _t	|j
| _d S )NFr'  r   )r{   r|   r   EncDecAttentionry   r   r   r   r   r   r   r   )r   r   r   r   r,   r:   r|   W  s   
z"LongT5LayerCrossAttention.__init__Fc                 C   sP   |  |}| j|||||||||	|
d
}|| |d  }|f|dd   }|S )N)	rn   r   r   r   r   r   r   r   r   r   r   )r   r;  r   )r   rv   r   rV   r   r   r   r   r   r   r   r)  r*  Zlayer_outputr  r,   r,   r:   r   ]  s    
z!LongT5LayerCrossAttention.forwardr   )NNNNFNFNr+  r,   r,   r   r:   r:  V  s    
r:  c                       sJ   e Zd Zd	dee f fddZ												d
ddZ  ZS )LongT5BlockFNr   c                    s   t    |j| _|jrt}n|jdkrt}n|jdkrt}n	td|j dt	 | _
| j
||||d | jrE| j
t||d | j
t| d S )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r'  )r   )r{   r|   r   r&  encoder_attention_typer-  r8  
ValueErrorr   
ModuleListlayerrH   r:  r   )r   r   r   r   Zattention_layerr   r,   r:   r|   }  s(   



zLongT5Block.__init__Tc                 C   s  | j d |||||	|
||d}|d d \}}	|dd  }|jtjkr<t| r<t|jjd }tj|| |d}| j	oB|d u}|r| j d ||||||	|d d |
||d
}|d d \}}	|jtjkrt| rt|jjd }tj|| |d}||dd   }| j d |}|jtjkrt| rt|jjd }tj|| |d}|f}|
r||	f | }|S || }|S )	Nr   )rV   r   r   r   r   r   r   rW   i  )r   rg   r   r+   )	r   rV   r   r   r   r   r   r   r   )
rB  r)   r5   r   isinfanyfinforg   clampr   )r   rv   rV   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr   r   r   return_dictr   Zself_attention_outputsZattention_outputsZclamp_valueZdo_cross_attentionZcross_attention_outputsr  r,   r,   r:   r     sX   

zLongT5Block.forwardr  )NNNNNNNNFFTNr+  r,   r,   r   r:   r<  |  s    r<  c                   @   sB   e Zd ZeZdZdZdgZdZdZ	e
dd Zdd Zd	d
 ZdS )LongT5PreTrainedModelZtransformerTr<  Fc                 C   s$   t t}t t}|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r5   rf   r   r   )r   rN  Z
input_maskdummy_inputsr,   r,   r:   rP    s   

z"LongT5PreTrainedModel.dummy_inputsc                 C   s  | j j}t|tr|jj|d  dS t|ttt	frC|j
jjjd|d d t|dr?| j jsA|jjjjd|d d dS dS dS t|tr|jjjjd|| j jd  d t|jdrk|jjdurk|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  dS dS dS t|tr|jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur|jjj  |jjjjd|| j jd  d t|jdr|jjdur	|jjj  dS dS dS t|tttfr| j j}| j j}| j j}|jjjjd||| d  d |jjjjd||d  d |j jjjd||d  d |j!jjjd||| d  d |j"r|j#jjjd||d  d t|tr|j$jjjd||d  d dS dS dS dS )zInitialize the weightsrc   rb   )r   Zstdlm_head      r   N)%r   Zinitializer_factorr   ry   r~   dataZfill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharedZnormal_hasattrtie_word_embeddingsrQ  r   r   r   r   Zzero_r   r   r   r   r   r   r  r!  r   r   r   r   r   r   r   r   r"  )r   modulefactorr   r   r   r,   r,   r:   _init_weights  sX   

       


z#LongT5PreTrainedModel._init_weightsc                 C   s   | j j}| j j}|d u rtdt|r1t|jd d d |}tj||dd df gdd}n|	|j}|dd df 
 |ddd f< ||d< |d u rStd||d	k| |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r+   )r   .rD   r   ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr@  r   r5   fullr2   rI   Z	new_zeroscloneZmasked_fill_)r   rN  r^  r_  Zshifted_input_idsr,   r,   r:   _shift_right&  s      z"LongT5PreTrainedModel._shift_rightN)r   r   r   r    Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingZ_no_split_modulesZ_supports_cache_classZ_supports_static_cachepropertyrP  r\  rb  r,   r,   r,   r:   rL    s    

1rL  c                       s   e Zd Zd fdd	Zdd Zdd Z													ddd	Z	
ddeej	df dej	dej	de
def
ddZedej	dededejdej	defddZ  ZS )LongT5StackNc                    s   t    t j j| _|d ur|j| j_ j| _ j	| _	| j	d | _
t fddt jD | _t j jd| _t j| _d| _|   d S )Nr   c                    s"   g | ]}t  t|d k|dqS )r   r'  )r<  r   ).0rJ   r   r,   r:   
<listcomp>O  s    z(LongT5Stack.__init__.<locals>.<listcomp>r   F)r{   r|   r   r   
vocab_sizer   embed_tokensr~   r   r  r$   rA  rE   
num_layersblockry   r   final_layer_normr   r   r   r   	post_init)r   r   ri  r   rf  r:   r|   C  s    

zLongT5Stack.__init__c                 C      | j S r   ri  r   r,   r,   r:   get_input_embeddings]     z LongT5Stack.get_input_embeddingsc                 C   
   || _ d S r   ro  r   Znew_embeddingsr,   r,   r:   set_input_embeddingsa     
z LongT5Stack.set_input_embeddingsc           )      C   s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d ur$|n| j j}|d urB|d urB| jr5dnd}td| d| d|d urS| }|d|d }n|d ur`| d d }n| jrednd}td| d| d	| j	r| j
r|	rtd
 d}	|d u r| jd usJ d| |}|\}}d}d}| jr|	s|d urt|trt|tsd}t|t }n#t|tsd}td t|}n|d u rtt t }n| jsd }|d ur| nd}|d u rtj||| |jd}|d u r	t s	|| }tj|||jd}| jr| ||||d ur|jnd |
}n| j jdkr/t|| j|j}n|}| jrX|d urX| \}}}||f}|d u rRtj||jd}| |}nd }| || j j }| || j j }|rodnd }|
rvdnd }|
r| jrdnd }d }d } | !|}!t"| j#D ]\}"}#||" }$||" }%|r||!f }| j	r| j
r| $|#j%|!||||| |$|%d |	|
||}&n|#|!||||| |$|%||	|
||d}&|	du r|&d d d |&dd   }&|&d d \}!}'|&d }| jr|d ur|&|
r dnd } |
r||&d f }| jr||&d f }q| &|!}!| !|!}!|r,||!f }|	r1|'nd }(|r9|j}(|r@|' }(|sQt(dd |!|(|||fD S t)|!|(|||dS )NZdecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer+   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   ra   r=  r,   )rV   r   rG  rH  rI  r   rJ  r   r   r   rK  r   r   r   rW      r	      c                 s   s    | ]	}|d ur|V  qd S r   r,   )re  r   r,   r,   r:   	<genexpr>%  s    z&LongT5Stack.forward.<locals>.<genexpr>)last_hidden_statepast_key_valuesrv   
attentionscross_attentions)*r   r   r   output_hidden_statesuse_return_dictr   r@  sizer   r   r   r   r   ri  r   r   r   r   Zfrom_legacy_cacheget_seq_lengthr5   rM   r=   r   rk   _update_causal_maskr   r?  rZ   r$   Zinvert_attention_maskZget_head_maskrj  r   	enumeraterk  Z_gradient_checkpointing_funcr   rl  Zto_legacy_cacherG   r   ))r   rN  rV   rG  rH  rx  	head_maskcross_attn_head_maskr}  r   r   r  rK  r   Zerr_msg_prefixZinput_shaperm   r   Zreturn_legacy_cacheZreturn_self_attention_cachepast_key_values_lengthZmask_seq_lengthr   Zencoder_batch_sizeZencoder_sequence_length_Zencoder_hidden_shapeZencoder_extended_attention_maskZall_hidden_statesZall_attentionsZall_cross_attentionsr   rI  rv   rJ   Zlayer_moduler   rJ  Zlayer_outputsZnext_decoder_cacheZ
next_cacher,   r,   r:   r   d  s.  








zLongT5Stack.forwardFrV   r!   input_tensorr   r}  r   c                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )NZflash_attention_2rb   Zflex_attentionr   FZsdpa)rx  r  Zis_trainingr   r+   )sequence_lengthtarget_lengthr)   r   rm   )cudaZxpuZnpu)r   Z_attn_implementationrD  r   r5   rd   r"   r  Zis_compileabler   Z_ignore_causal_mask_sdpar   r)   r2   Zget_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr=   r]   rE  r   Z_unmask_unattended)r   rV   r  r   r}  r   Zpast_seen_tokensZusing_compilable_cacher)   r  r  r   	min_dtyper,   r,   r:   r  9  sT   




zLongT5Stack._update_causal_maskr  r  r)   rm   c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`int`):
                Batch size.
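
A sketch of the construction this docstring describes, assuming an additive mask convention (0 where attention is allowed, the dtype minimum where it is blocked). The standalone function name and the device handling are illustrative simplifications of the compiled static method.

```python
from typing import Optional

import torch


def build_causal_4d_mask(attention_mask: Optional[torch.Tensor], sequence_length: int,
                         target_length: int, dtype: torch.dtype,
                         cache_position: torch.Tensor, batch_size: int) -> torch.Tensor:
    """Expand a 2D padding mask into a (batch, 1, query_len, key_len) additive causal mask."""
    min_dtype = torch.finfo(dtype).min

    # Start fully blocked above the diagonal (strictly future positions get min_dtype).
    causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype)
    if sequence_length != 1:
        causal_mask = torch.triu(causal_mask, diagonal=1)
    # Also block key positions beyond each query's cache position.
    causal_mask *= torch.arange(target_length) > cache_position.reshape(-1, 1)
    causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1).clone()

    if attention_mask is not None:
        mask_length = attention_mask.shape[-1]
        # Key positions that the causal part allows (0) but that are padding (0) end up blocked.
        padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(dtype)
        causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
            padding_mask == 0, min_dtype
        )
    return causal_mask


mask = build_causal_4d_mask(torch.ones(2, 5), sequence_length=5, target_length=5,
                            dtype=torch.float32, cache_position=torch.arange(5), batch_size=2)
print(mask.shape)  # torch.Size([2, 1, 5, 5])
```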
        Nry  )Z
fill_valuer)   r=   r   )Zdiagonalra   r+   r   )r%   r5   rE  r   r`  r=   ZtriurM   r?   expandra  r2   rS   Zmasked_fill)rV   r  r  r)   r   rm   r1  r   r  Zmask_lengthZpadding_maskr,   r,   r:   r  }  s,    $
6  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_positionr   )NNNNNNNNNNNNNr  )r   r   r   r|   rq  ru  r   r   r5   rd   r   r   r  r  rl   r)   r  r   r,   r,   r   r:   rd  B  sZ    
 \
Drd  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c                &       sJ  e Zd ZdgZddgZdef fddZdd Zd	d
 Zdd Z	dd Z
dd Zdd Ze																d'deej deej deej deej deej deej deej deeeej   deeeej   deej deej dee d ee d!ee d"ee d#eej d$eeej ef f"d%d&Z  ZS )(rT  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    s   t  | t|j|j| _t|}d|_	d|_
d|_t|| j| _t|}d|_	d|_|j|_t|| j| _|   d S )NFT)r{   r|   r   r   rh  r   rW  copydeepcopyr   r   is_encoder_decoderrd  encodernum_decoder_layersrj  decoderrm  r   r   encoder_configZdecoder_configr   r,   r:   r|     s   

zLongT5Model.__init__c                 C   rn  r   rW  rp  r,   r,   r:   rq    rr  z LongT5Model.get_input_embeddingsc                 C   "   || _ | j| | j| d S r   rW  r  ru  r  rt  r,   r,   r:   ru       z LongT5Model.set_input_embeddingsc                 C   4   | j jr| | jj| j | | jj| j d S d S r   r   rY  _tie_or_clone_weightsr  ri  rW  r  rp  r,   r,   r:   _tie_weights     zLongT5Model._tie_weightsc                 C   rn  r   r  rp  r,   r,   r:   get_encoder  rr  zLongT5Model.get_encoderc                 C   rn  r   r  rp  r,   r,   r:   get_decoder  rr  zLongT5Model.get_decoderc                 C   *   |  D ]\}}| jj| j| qdS z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class PreTrainedModel
        Nitemsr  rB  Z	attentionr   r   Zheads_to_prunerB  r   r,   r,   r:   _prune_heads     zLongT5Model._prune_headsNrN  rV   rM  rO  r  decoder_head_maskr  encoder_outputsr}  rx  decoder_inputs_embedsr   r   r  rK  r   r'   c                 C   s"  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }| j||||	|||||||||d}|s}|| S t|j|j|j|j|j|j|j|jdS )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NrN  rV   rx  r  r   r  rK  r   r   rW   r|  rv   r~  rN  rV   rx  r}  rG  rH  r  r  r   r   r  rK  r   )r|  r}  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_staterG  encoder_attentions)r   r   r  rj  r  warningswarnZ#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningr  r   r   r   r  r   r|  r}  rv   r~  r  )r   rN  rV   rM  rO  r  r  r  r  r}  rx  r  r   r   r  rK  r   rv   decoder_outputsr,   r,   r:   r     sd   Q	zLongT5Model.forward)NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr    r|   rq  ru  r  r  r  r  r   r   r5   
LongTensorFloatTensor
BoolTensorrd   r   r   r   r   r   r   r,   r,   r   r:   rT    s~    	
rT  z>
    LONGT5 Model with a `language modeling` head on top.
    )Zcustom_introc                (       sv  e Zd ZdgZg dZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Zdd Zdd Ze																	d-deej deej deej deej deej deej deej deeeej   deeeej   deej deej d eej d!ee d"ee d#ee d$ee d%eej d&eeej ef f$d'd(Zd ejfd)d*Zd+d, Z  ZS ).rU  r  )r  r  zlm_head.weightr   c                    s   t  | |j| _t|j|j| _t	|}d|_
d|_d|_t|| j| _t	|}d|_
d|_|j|_t|| j| _tj|j|jdd| _|   d S )NFTr   )r{   r|   r   	model_dimr   r   rh  rW  r  r  r   r   r  rd  r  r  rj  r  r   rQ  rm  r  r   r,   r:   r|     s   

z'LongT5ForConditionalGeneration.__init__c                 C   rn  r   r  rp  r,   r,   r:   rq    rr  z3LongT5ForConditionalGeneration.get_input_embeddingsc                 C   r  r   r  rt  r,   r,   r:   ru    r  z3LongT5ForConditionalGeneration.set_input_embeddingsc                 C   r  r   r  rp  r,   r,   r:   r    r  z+LongT5ForConditionalGeneration._tie_weightsc                 C   rs  r   rQ  rt  r,   r,   r:   set_output_embeddings  rv  z4LongT5ForConditionalGeneration.set_output_embeddingsc                 C   rn  r   r  rp  r,   r,   r:   get_output_embeddings  rr  z4LongT5ForConditionalGeneration.get_output_embeddingsc                 C   rn  r   r  rp  r,   r,   r:   r    rr  z*LongT5ForConditionalGeneration.get_encoderc                 C   rn  r   r  rp  r,   r,   r:   r    rr  z*LongT5ForConditionalGeneration.get_decoderNrN  rV   rM  rO  r  r  r  r  r}  rx  r  labelsr   r   r  rK  r   r'   c                 C   s  |dur|n| j j}|dur|n| j j}|dur,|du r,| j j| j jkr,ttt |}|du r=| j	|||
||||d}n$|rat
|tsat|d t|dkrR|d ndt|dkr]|d ndd}|d }|durv|du rv|du rv| |}| j||||	|||||||||d}|d }| j jr|| jd  }| |}d}|durtd	d
}||j}||d|d|d}|s|f|dd  | }|dur|f| S |S t|||j|j|j|j|j|j|jd	S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked); the loss is only computed for
            labels in `[0, ..., config.vocab_size - 1]`.

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nr  r   r   rW   r  r  rR  r]  )Zignore_indexr+   )	lossZlogitsr}  r  r  r  r  rG  r  )r   r   r  rj  r  r  r  Z6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  r  r   r   r   rb  r  rY  r  rQ  r   rS   r=   r   r  r   r}  rv   r~  r  r|  )r   rN  rV   rM  rO  r  r  r  r  r}  rx  r  r  r   r   r  rK  r   rv   r  Zsequence_outputZ	lm_logitsr  Zloss_fctoutputr,   r,   r:   r     s~   U	


z&LongT5ForConditionalGeneration.forwardc                 C   s
   |  |S r   )rb  )r   r  r,   r,   r:   %prepare_decoder_input_ids_from_labelsf  rv  zDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsc              	   C   s   |d u rt d |S d}|D ]1}d}|D ]}||d||jf }q|d j|d jks1J t|t|ks;J ||f }q|S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr,   r   )r   warningZindex_selectrS   r=   r2   r   )r   r}  Zbeam_idxZreordered_decoder_pastZlayer_past_statesZreordered_layer_past_statesZlayer_past_stater,   r,   r:   _reorder_cachei  s   
z-LongT5ForConditionalGeneration._reorder_cache)NNNNNNNNNNNNNNNNN)r   r   r   r  r  r    r|   rq  ru  r  r  r  r  r  r   r   r5   r  r  r  rd   r   r   r   r   r   r  r  r   r,   r,   r   r:   rU    s    	
 )rU  c                       s   e Zd ZdgZdgZdef fddZdd Zdd	 Zd
d Z	dd Z
dd Ze							ddeej deej deej deej dee dee dee deeej ef fddZ  ZS )rV  r  r  r   c                    sN   t  | t|j|j| _t|}d|_	d|_
t|| j| _|   d S )NF)r{   r|   r   r   rh  r   rW  r  r  r   r  rd  r  rm  )r   r   r  r   r,   r:   r|     s   
zLongT5EncoderModel.__init__c                 C   rn  r   r  rp  r,   r,   r:   rq    rr  z'LongT5EncoderModel.get_input_embeddingsc                 C   s   || _ | j| d S r   )rW  r  ru  rt  r,   r,   r:   ru    s   z'LongT5EncoderModel.set_input_embeddingsc                 C   s"   | j jr| | jj| j d S d S r   )r   rY  r  r  ri  rW  rp  r,   r,   r:   r    s   zLongT5EncoderModel._tie_weightsc                 C   rn  r   r  rp  r,   r,   r:   r    rr  zLongT5EncoderModel.get_encoderc                 C   r  r  r  r  r,   r,   r:   r    r  zLongT5EncoderModel._prune_headsNrN  rV   r  rx  r   r  rK  r'   c           	   	   C   s0   |dur|n| j j}| j|||||||d}|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            To know more on how to prepare `input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  )r   r  r  )	r   rN  rV   r  rx  r   r  rK  r  r,   r,   r:   r     s   #
zLongT5EncoderModel.forward)NNNNNNN)r   r   r   r  r  r    r|   rq  ru  r  r  r  r   r   r5   r  r  r   r   r   r   r   r   r,   r,   r   r:   rV    sD    	rV  )rV  rU  rT  rL  )r   )[r7  r  r   r  typingr   r   r   r   r   r5   r   Ztorch.nnr   Zactivationsr
   Zcache_utilsr   r   r   Z
generationr   Zmodeling_attn_mask_utilsr   Zmodeling_outputsr   r   r   r   Zmodeling_utilsr   Zpytorch_utilsr   r   r   utilsr   r   r   r   r   r   r   Zconfiguration_longt5r    Z!torch.nn.attention.flex_attentionr!   Zintegrations.flex_attentionr"   Z
get_loggerr   r   rd   rl   r;   rA   rL   rP   rU   r=   rZ   rp   ru   rx   Modulery   Zapex.normalizationr   infoImportError	Exceptionr  rH   r   r   r   r   r  r!  r&  r-  r8  r:  r<  rL  rd  Z__HEAD_MASK_WARNING_MSGrT  rU  rV  __all__r,   r,   r,   r:   <module>   s   $	
$$	 	
1	


 e A  
$"&da  w C |X