o
    ZhJ                    @   s  d Z ddlZddlZddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ e"%e&Z'dej(de)de)fddZ*G dd de
j+Z,G dd de
j-Z.G dd de
j-Z/G dd de
j-Z0G dd de
j-Z1G dd de
j-Z2e!G d d! d!eZ3G d"d# d#e3Z4G d$d% d%e3Z5e!G d&d' d'e3Z6e!d(d)G d*d+ d+e3eZ7e!d,d)G d-d. d.e3Z8e!G d/d0 d0e3Z9G d1d2 d2e3Z:G d3d4 d4e3eZ;g d5Z<dS )6zPyTorch MVP model.    N)ListOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )	MvpConfig	input_idspad_token_iddecoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |du r*td||dk| |S )z1
    Shift input ids one token to the right.
    Nr   r   z1self.model.config.pad_token_id has to be defined.i)Z	new_zerosshapeclone
ValueErrorZmasked_fill_)r   r   r   Zshifted_input_ids r"   S/var/www/auris/lib/python3.10/site-packages/transformers/models/mvp/modeling_mvp.pyshift_tokens_right2   s   (r$   c                       sJ   e Zd ZdZdedef fddZddejd	ed
ejf fddZ  Z	S )MvpLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                    s   d| _ t || j  | d S N   )offsetsuper__init__)selfr&   r'   	__class__r"   r#   r,   H   s   z&MvpLearnedPositionalEmbedding.__init__r   Nr   past_key_values_lengthposition_idsc                    s\   |du r |j dd \}}tj||| tj| jjd|d}n|d}t 	|| j
 S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr)   )dtypedevicer   r   )r   torcharangelongweightr3   expandZ	unsqueezer+   forwardr*   )r-   r   r0   r1   bszseq_lenr.   r"   r#   r9   N   s   
z%MvpLearnedPositionalEmbedding.forwardr   N)
__name__
__module____qualname____doc__intr,   r4   Tensorr9   __classcell__r"   r"   r.   r#   r%   C   s    (r%   c                       s   e Zd ZdZ			ddedededed	ef
 fd
dZdej	dedefddZ
						ddej	deej	 deeej	  deej	 deej	 deej	 dedeej	eej	 eeej	  f fddZ  ZS )MvpAttentionz=Multi-headed attention from 'Attention Is All You Need' paper        FT	embed_dim	num_headsdropout
is_decoderbiasc                    s   t    || _|| _|| _|| | _| j| | jkr'td| j d| d| jd | _|| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j
|||d| _d S )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rJ   )r+   r,   rF   rG   rH   head_dimr!   scalingrI   r   Lineark_projv_projq_projout_proj)r-   rF   rG   rH   rI   rJ   r.   r"   r#   r,   _   s"   


zMvpAttention.__init__tensorr;   r:   c                 C   s    | ||| j| jdd S )Nr   r)   )viewrG   rL   	transpose
contiguous)r-   rS   r;   r:   r"   r"   r#   _shapez   s    zMvpAttention._shapeNhidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskattn_promptoutput_attentionsreturnc                 C   s  |du}|  \}	}
}| || j }|r"|dur"|d }|d }nZ|r9| | |d|	}| | |d|	}nC|durh| | |d|	}| | |d|	}tj|d |gdd}tj|d |gdd}n| | |d|	}| | |d|	}| jr||f}|durtj|d 	|	ddd|gdd}tj|d 	|	ddd|gdd}|durt
|	d|
|d  d|j}tj||gdd}|	| j d| jf}| ||
|	j| }|j| }|j| }| d}t||dd}|  |	| j |
|fkrtd|	| j |
|f d|   |durG|  |	d|
|fkr2td	|	d|
|f d|   ||	| j|
|| }||	| j |
|}tjj|dd}|dur|  | jfkrktd
| jf d|   |dddd||	| j|
| }||	| j |
|}|r||	| j|
|}||	| j |
|}nd}tjj|| j| jd}t||}|  |	| j |
| jfkrtd|	| j|
| jf d|   ||	| j|
| j}|dd}||	|
| j}| |}|||fS )z#Input shape: Batch x Time x ChannelNr   r   r   r)   dimz$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )sizerQ   rM   rW   rO   rP   r4   catrI   r8   zerostor3   rG   rL   rT   ZbmmrU   r!   r   
functionalZsoftmaxrH   rd   ZreshaperF   rR   )r-   rX   rY   rZ   r[   r\   r]   r^   Zis_cross_attentionr:   tgt_len_Zquery_statesZ
key_statesZvalue_statesZprompt_maskZ
proj_shapeZsrc_lenattn_weightsZattn_weights_reshapedZ
attn_probsZattn_outputr"   r"   r#   r9   }   s   
"""




"

zMvpAttention.forward)rE   FT)NNNNNF)r=   r>   r?   r@   rA   floatboolr,   r4   rB   rW   r   r   r9   rC   r"   r"   r.   r#   rD   \   sP    	rD   c                       sd   e Zd Zdef fddZ	ddejdejdejdejd	ee d
e	ejeej f fddZ
  ZS )MvpEncoderLayerconfigc                    s   t    |j| _t| j|j|jd| _t	| j| _
|j| _t|j | _|j| _t| j|j| _t|j| j| _t	| j| _d S )N)rF   rG   rH   )r+   r,   d_modelrF   rD   encoder_attention_headsattention_dropout	self_attnr   	LayerNormself_attn_layer_normrH   r   activation_functionactivation_fnactivation_dropoutrN   Zencoder_ffn_dimfc1fc2final_layer_normr-   rp   r.   r"   r#   r,      s   
zMvpEncoderLayer.__init__FrX   r[   r\   self_attn_promptr^   r_   c                 C   s  |}| j |||||d\}}}tjj|| j| jd}|| }| |}|}| | |}tjj|| j| jd}| 	|}tjj|| j| jd}|| }| 
|}|jtjkrwt| set| rwt|jjd }	tj||	 |	d}|f}
|r|
|f7 }
|
S )a@  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, encoder_attention_heads, pro_len, head_dim)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rX   r[   r\   r]   r^   rb   i  )minmax)rt   r   ri   rH   rd   rv   rx   rz   ry   r{   r|   r2   r4   Zfloat16isinfanyisnanZfinfor   clamp)r-   rX   r[   r\   r~   r^   residualrl   rk   Zclamp_valueoutputsr"   r"   r#   r9     s:   



zMvpEncoderLayer.forward)F)r=   r>   r?   r   r,   r4   FloatTensorr   rn   r   r9   rC   r"   r"   r.   r#   ro      s     ro   c                       s   e Zd Zdef fddZ										ddejdeej d	eej d
eej deej deej deej deej deeej  dee	 dee	 deej
eeej
ej
f  f fddZ  ZS )MvpDecoderLayerrp   c                    s   t    |j| _t| j|j|jdd| _|j| _t	|j
 | _|j| _t| j| _t| j|j|jdd| _t| j| _t| j|j| _t|j| j| _t| j| _d S )NT)rF   rG   rH   rI   )rH   rI   )r+   r,   rq   rF   rD   decoder_attention_headsrs   rt   rH   r   rw   rx   ry   r   ru   rv   encoder_attnencoder_attn_layer_normrN   Zdecoder_ffn_dimrz   r{   r|   r}   r.   r"   r#   r,   @  s,   
zMvpDecoderLayer.__init__NFTrX   r[   encoder_hidden_statesencoder_attention_maskr\   cross_attn_layer_head_maskr~   cross_attn_promptrZ   r^   	use_cacher_   c              	   C   s^  |}|	dur|	dd nd}| j ||||||
d\}}}tjj|| j| jd}|| }| |}d}d}|durm|}|	durD|	dd nd}| j|||||||
d\}}}tjj|| j| jd}|| }| |}|| }|}| | 	|}tjj|| j
| jd}| |}tjj|| j| jd}|| }| |}|f}|
r|||f7 }|r||f7 }|S )aC  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            self_attn_prompt (`torch.FloatTensor`): prompt of self attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            cross_attn_prompt (`torch.FloatTensor`): prompt of cross attention of shape
                `(2, decoder_attention_heads, pro_len, head_dim)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr)   )rX   rZ   r[   r\   r]   r^   rb   )rX   rY   r[   r\   r]   rZ   r^   )rt   r   ri   rH   rd   rv   r   r   rx   rz   ry   r{   r|   )r-   rX   r[   r   r   r\   r   r~   r   rZ   r^   r   r   Zself_attn_past_key_valueZself_attn_weightsZpresent_key_valueZcross_attn_present_key_valueZcross_attn_weightsZcross_attn_past_key_valuer   r"   r"   r#   r9   Z  sX   $
	



zMvpDecoderLayer.forward)
NNNNNNNNFT)r=   r>   r?   r   r,   r4   rB   r   r   rn   r   r9   rC   r"   r"   r.   r#   r   ?  sJ    	
r   c                       sH   e Zd ZdZdedededef fddZdejd	ejfd
dZ	  Z
S )MvpClassificationHeadz-Head for sentence-level classification tasks.	input_dim	inner_dimnum_classespooler_dropoutc                    s8   t    t||| _tj|d| _t||| _d S )Nrc   )r+   r,   r   rN   denseDropoutrH   rR   )r-   r   r   r   r   r.   r"   r#   r,     s   
zMvpClassificationHead.__init__rX   r_   c                 C   s6   |  |}| |}t|}|  |}| |}|S N)rH   r   r4   tanhrR   )r-   rX   r"   r"   r#   r9     s   




zMvpClassificationHead.forward)r=   r>   r?   r@   rA   rm   r,   r4   rB   r9   rC   r"   r"   r.   r#   r     s    r   c                       s:   e Zd ZdZ fddZdejdeej fddZ  Z	S )	MvpPromptz)Layer-wise prompt for encoder or decoder.c              	      s   t    |j| _|| _|| _|j| | _tj|j	d| _	t
|j|j| _tt|j|jt t|j|d |j | _d S )Nr   r)   )r+   r,   prompt_length
num_layersrG   rq   rL   r   r   rH   	Embeddingprompt_embeddingZ
SequentialrN   Zprompt_mid_dimZGELUprompt_trans)r-   rp   r   rG   r.   r"   r#   r,     s   

zMvpPrompt.__init__
prompt_idsr_   c                 C   sN   |  | |}|| j| jd | j| j}| |}|g d	d}|S )Nr)   )r   r)   r   r
   )
r   r   rT   r   r   rG   rL   rH   Zpermutesplit)r-   r   promptr"   r"   r#   r9     s
   
zMvpPrompt.forward)
r=   r>   r?   r@   r,   r4   rB   r   r9   rC   r"   r"   r.   r#   r     s    "r   c                   @   s,   e Zd ZeZdZdZdd Zedd Z	dS )MvpPreTrainedModelmodelTc                 C   s   | j j}t|tjr"|jjjd|d |jd ur |jj	  d S d S t|tj
rA|jjjd|d |jd urC|jj|j 	  d S d S d S )NrE   )meanstd)rp   Zinit_std
isinstancer   rN   r7   dataZnormal_rJ   Zzero_r   padding_idx)r-   moduler   r"   r"   r#   _init_weights  s   

z MvpPreTrainedModel._init_weightsc                 C   s>   | j j}tjg ddddd|gg| jd}|||d}|S )N)r      
      r)   r         r)   r3   )r[   r   )rp   r   r4   rS   r3   ne)r-   Z	pad_tokenr   dummy_inputsr"   r"   r#   r     s   "zMvpPreTrainedModel.dummy_inputsN)
r=   r>   r?   r   Zconfig_classZbase_model_prefixZsupports_gradient_checkpointingr   propertyr   r"   r"   r"   r#   r     s    r   c                       s   e Zd ZdZ	ddedeej dee f fddZ	d	d
 Z
dd Z							ddeej deej deej deej dee dee dee deeef fddZ  ZS )
MvpEncodera  
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`MvpEncoderLayer`].

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFrp   embed_tokens
use_promptc                    s   t     j| _ j| _ j} j| _ j| _	 j
r!t|nd| _|d ur,|| _n
t j|| j| _t j|| _t fddt jD | _t|| _|| _|re j| _t  j j| _d| _|    d S )N      ?c                       g | ]}t  qS r"   )ro   .0rk   rp   r"   r#   
<listcomp>+      z'MvpEncoder.__init__.<locals>.<listcomp>F)!r+   r,   rH   Zencoder_layerdrop	layerdroprq   r   r   max_position_embeddingsZmax_source_positionsscale_embeddingmathsqrtembed_scaler   r   r   
vocab_sizer%   embed_positions
ModuleListrangeZencoder_layerslayersru   layernorm_embeddingr   r   r   rr   r~   gradient_checkpointing	post_init)r-   rp   r   r   rF   r.   r   r#   r,     s4    zMvpEncoder.__init__c                 C      | j S r   r   r-   r"   r"   r#   get_input_embeddings;     zMvpEncoder.get_input_embeddingsc                 C   
   || _ d S r   r   r-   valuer"   r"   r#   set_input_embeddings>     
zMvpEncoder.set_input_embeddingsr   r[   	head_maskinputs_embedsr^   output_hidden_statesreturn_dictr_   c              	   C   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}|dur*|dur*td|dur<|}|j}	|d|	d }n|durT| dd }	|dddddf }ntd|du rd| || j	 }| 
|}
||
 }| |}tjj|| j| jd}| jrt| j| j}| |}|durt||j}|rdnd}|rdnd}|dur| d t| jkrtdt| j d	| d  d
t| jD ]n\}}|r||f }d}| jrtg }|| jk rd}|rd}nC| jr| jr| |j |||dur|| nd| jr|| nd|}n||||dur|| nd| jr&|| nd|d}|d }|r9||d f }q|rB||f }|sQt!dd |||fD S t"|||dS )a~  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrb   r"   r   z&The head_mask should be specified for  layers, but it is for .FT)NN)r\   r~   r^   r   c                 s       | ]	}|d ur|V  qd S r   r"   r   vr"   r"   r#   	<genexpr>  s    z%MvpEncoder.forward.<locals>.<genexpr>last_hidden_staterX   
attentions)#rp   r^   r   use_return_dictr!   r   rT   re   r   r   r   r   r   ri   rH   rd   r   r4   r5   r   rh   r3   r~   r   r2   lenr   	enumeraterandr   r   _gradient_checkpointing_func__call__tupler   )r-   r   r[   r   r   r^   r   r   inputinput_shapeZ	embed_posrX   r   r~   Zencoder_statesZall_attentionsidxZencoder_layerZto_dropdropout_probabilitylayer_outputsr"   r"   r#   r9   A  s   .






	
zMvpEncoder.forwardNF)NNNNNNN)r=   r>   r?   r@   r   r   r   r   rn   r,   r   r   r4   
LongTensorrB   r   r   r   r   r9   rC   r"   r"   r.   r#   r   
  sH    &
	r   c                       s   e Zd ZdZ	ddedeej dee f fddZ	d	d
 Z
dd Z												ddeej deej deej deej deej deej deeej  deej dee dee dee dee deeef fddZ  ZS )
MvpDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MvpDecoderLayer`]

    Args:
        config: MvpConfig
        embed_tokens (nn.Embedding): output embedding
        use_prompt (bool): whether to use prompt
    NFrp   r   r   c                    s   t     j| _ j| _ j| _ j| _ j	rt
 jnd| _|d ur*|| _nt j j| j| _t j j| _t fddt jD | _t j| _|| _|ro j| _t  j j| _t  j j| _d| _ | !  d S )Nr   c                    r   r"   )r   r   r   r"   r#   r     r   z'MvpDecoder.__init__.<locals>.<listcomp>F)"r+   r,   rH   Zdecoder_layerdropr   r   r   r   Zmax_target_positionsr   r   r   rq   r   r   r   r   r   r%   r   r   r   Zdecoder_layersr   ru   r   r   r   r   r   r~   r   r   r   )r-   rp   r   r   r.   r   r#   r,     s<    zMvpDecoder.__init__c                 C   r   r   r   r   r"   r"   r#   r      r   zMvpDecoder.get_input_embeddingsc                 C   r   r   r   r   r"   r"   r#   r     r   zMvpDecoder.set_input_embeddingsr   r[   r   r   r   cross_attn_head_maskpast_key_valuesr   r   r^   r   r   r_   c           !      C   s(  |
dur|
n| j j}
|dur|n| j j}|	dur|	n| j j}	|dur$|n| j j}|dur4|dur4td|durF|}|j}|d|d }n|dur^| dd }|dddddf }ntd|duro|d d jd nd}|du r}| 	|| j
 }t||||}|dur|durt||j|d d}| ||}|| }| |}tjj|| j| jd}| jrt| j| j}| |}| |}| jr| jr|	rtd	 d
}	|rdnd}|
rdnd}|
r|durdnd}|	rdnd}t||gddgD ]*\}}|dur%| d t | j!kr%td| dt | j! d| d  dqt"| j!D ]\}}|r7||f7 }| jrHt#g }|| j$k rHq+|durQ|| nd}| jr| jr| %|j&|||||durl|| nd|durv|| nd| jr|| nd| jr|| ndd|
|	}n1||||||dur|| nd|dur|| nd| jr|| nd| jr|| nd||
|	d}|d }|	r|||
rdnd f7 }|
r||d f7 }|dur||d f7 }q+|r||f7 }|	r|nd} |st'dd || |||fD S t(|| |||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r)   )rj   rb   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr"   r   r   zThe `z` should be specified for r   r   )
r[   r   r   r\   r   r~   r   rZ   r^   r   r
   r   c                 s   r   r   r"   r   r"   r"   r#   r     s    z%MvpDecoder.forward.<locals>.<genexpr>)r   r   rX   r   cross_attentions))rp   r^   r   r   r   r!   r   rT   re   r   r   r   r   r2   r   r   r   ri   rH   rd   r   r4   r5   r   rh   r3   r~   r   r   loggerZwarning_oncezipr   r   r   r   r   r   r   r   r   )!r-   r   r[   r   r   r   r   r   r   r   r^   r   r   r   r   r0   Z	positionsrX   r   r~   r   Zall_hidden_statesZall_self_attnsZall_cross_attentionsZnext_decoder_cacheZ	attn_maskZ	mask_namer   Zdecoder_layerr   rZ   r   Z
next_cacher"   r"   r#   r9     s   P








zMvpDecoder.forwardr   )NNNNNNNNNNNN)r=   r>   r?   r@   r   r   r   r   rn   r,   r   r   r4   r   rB   r   r   r   r   r   r9   rC   r"   r"   r.   r#   r     sf    
(	

r   c                $       s(  e Zd ZdgZddgZdef fddZdd Zd	d
 Zdd Z	dd Z
dd Ze															d$deej deej deej deej deej deej deej deeej  deeej  deej deej dee dee dee d ee d!eeef f d"d#Z  ZS )%MvpModelfinal_logits_biasencoder.embed_tokens.weightdecoder.embed_tokens.weightrp   c                    sd   t  | |j|j}}|j| _t||j|| _t	|| j|j| _
t|| j|j| _|   d S r   )r+   r,   r   r   r   r   r   rq   sharedr   encoderr   decoderr   )r-   rp   r   r   r.   r"   r#   r,     s   zMvpModel.__init__c                 C   r   r   )r   r   r"   r"   r#   r     r   zMvpModel.get_input_embeddingsc                 C   s   || _ | j | j_| j | j_d S r   )r   r   r   r   r   r"   r"   r#   r     s   
zMvpModel.set_input_embeddingsc                 C   r   r   )r   r   r"   r"   r#   get_encoder  r   zMvpModel.get_encoderc                 C   r   r   r   r   r"   r"   r#   get_decoder  r   zMvpModel.get_decoderc                 C   sF   | j sJ d| d | jjd | jjd | jjd d S )NzHIf you want to use lightweight tuning, make sure that `use_prompt=True`.FT)r   requires_grad_r   r~   r   r   r   r"   r"   r#   set_lightweight_tuning  s
   
zMvpModel.set_lightweight_tuningNr   r[   decoder_input_idsdecoder_attention_maskr   decoder_head_maskr   encoder_outputsr   r   decoder_inputs_embedsr   r^   r   r   r_   c                 C   sH  |du r|du r|du rt dt|| jj| jj}|dur |n| jj}|dur*|n| jj}|dur4|n| jj}|dur>|n| jj}|du rS| j	||||
|||d}n$|rwt
|tswt|d t|dkrh|d ndt|dkrs|d ndd}| j|||d ||||	|||||d}|s|| S t|j|j|j|j|j|j|j|jd	S )
a"  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        NzIf no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)r   r[   r   r   r^   r   r   r   r   r)   r   r   r[   r   r   r   r   r   r   r   r^   r   r   )r   r   decoder_hidden_statesdecoder_attentionsr   encoder_last_hidden_stater   encoder_attentions)r!   r$   rp   r   r   r^   r   r   r   r   r   r   r   r   r   r   r   rX   r   r   )r-   r   r[   r  r  r   r  r   r	  r   r   r
  r   r^   r   r   Zdecoder_outputsr"   r"   r#   r9     sn   2
zMvpModel.forwardNNNNNNNNNNNNNNN)r=   r>   r?   Z"_keys_to_ignore_on_load_unexpected_tied_weights_keysr   r,   r   r   r  r  r  r   r   r4   r   rB   r   r   rn   r   r   r   r9   rC   r"   r"   r.   r#   r     st    	

r   ze
    The MVP Model with a language modeling head. Can be used for various text generation tasks.
    )Zcustom_introc                &       s  e Zd Zg dZdef fddZdd Zdd Z	
d/dede	e de
dejf fddZdedd	fddZdd Zdd Zdd Ze																																d0de	ej de	ej de	ej de	ej de	ej de	ej de	ej d e	eej  d!e	eej  d"e	ej d#e	ej d$e	ej d%e	e
 d&e	e
 d'e	e
 d(e	e
 deeef f"d)d*Zd$ejfd+d,Zed-d. Z  ZS )1MvpForConditionalGeneration)r   r   lm_head.weightrp   c                    sX   t  | t|| _| dtd| jjjf t	j
|j| jjjdd| _|   d S )Nr   r   FrK   )r+   r,   r   r   register_bufferr4   rg   r   r&   r   rN   rq   lm_headr   r}   r.   r"   r#   r,     s
   
z$MvpForConditionalGeneration.__init__c                 C   
   | j  S r   )r   r  r   r"   r"   r#   r    r   z'MvpForConditionalGeneration.get_encoderc                 C   r  r   )r   r  r   r"   r"   r#   r    r   z'MvpForConditionalGeneration.get_decoderNTnew_num_tokenspad_to_multiple_ofmean_resizingr_   c                    s   t  |||}| | |S r   )r+   resize_token_embeddings_resize_final_logits_bias)r-   r  r  r  new_embeddingsr.   r"   r#   r    s   
z3MvpForConditionalGeneration.resize_token_embeddingsc                 C   sj   | j jd }||kr| j d d d |f }ntjd|| f| j jd}tj| j |gdd}| d| d S )Nr   r   r   r`   r   )r   r   r4   rg   r3   rf   r  )r-   r  Zold_num_tokensZnew_biasZ
extra_biasr"   r"   r#   r    s   z5MvpForConditionalGeneration._resize_final_logits_biasc                 C   r   r   r  r   r"   r"   r#   get_output_embeddings  r   z1MvpForConditionalGeneration.get_output_embeddingsc                 C   r   r   r  r-   r  r"   r"   r#   set_output_embeddings  r   z1MvpForConditionalGeneration.set_output_embeddingsc                 C      | j   | jd d S r   r   r  r  r  r   r"   r"   r#   r       
z2MvpForConditionalGeneration.set_lightweight_tuningr   r[   r  r  r   r  r   r	  r   r   r
  labelsr   r^   r   r   c                 C   s  |dur|n| j j}|dur)|rtd d}|du r)|du r)t|| j j| j j}| j|||||||||	|
|||||d}| |d | j	 }d}|dur^t
 }||d| j j|d}|st|f|dd  }|durr|f| S |S t|||j|j|j|j|j|j|jd	S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example of summarization:

        Fine-tuning a model
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.",
        ...     return_tensors="pt",
        ... )
        >>> labels = tokenizer("Bad Reasons To Quit Your Job", return_tensors="pt")["input_ids"]

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     generated_ids = model.generate(**inputs)

        >>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        ```
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)r[   r  r	  r  r   r  r   r   r   r
  r   r^   r   r   r   r   r   	losslogitsr   r  r  r   r  r   r  )rp   r   r   warningr$   r   r   r   r  r   r   rT   r   r   r   r  r  r   r  r   r  )r-   r   r[   r  r  r   r  r   r	  r   r   r
  r$  r   r^   r   r   r   Z	lm_logitsZmasked_lm_lossloss_fctoutputr"   r"   r#   r9     sZ   Q
z#MvpForConditionalGeneration.forwardc                 C   s   t || jj| jjS r   )r$   rp   r   r   )r-   r$  r"   r"   r#   %prepare_decoder_input_ids_from_labels;  s   zAMvpForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    sB   d}| D ]}|t  fdd|d d D |dd   f7 }q|S )Nr"   c                 3   $    | ]}| d  |jV  qdS r<   Zindex_selectrh   r3   r   Z
past_statebeam_idxr"   r#   r   D     " z=MvpForConditionalGeneration._reorder_cache.<locals>.<genexpr>r)   r   r   r0  Zreordered_pastZ
layer_pastr"   r/  r#   _reorder_cache>  s   
z*MvpForConditionalGeneration._reorder_cache)NTNNNNNNNNNNNNNNNN) r=   r>   r?   r  r   r,   r  r  rA   r   rn   r   r   r  r  r  r   r  r   r4   r   rB   r   r   r   r   r   r9   r+  staticmethodr4  rC   r"   r"   r.   r#   r    s    			

 r  z
    Mvp model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                $       s   e Zd ZddgZdef fddZdd Ze															dd	ee	j
 d
ee	j dee	j
 dee	j
 dee	j dee	j dee	j deee	j  dee	j dee	j dee	j
 dee dee dee dee deeef f ddZ  ZS )MvpForSequenceClassificationr   r   rp   c                    sB   t  j|fi | t|| _t|j|j|j|j| _| 	  d S r   )
r+   r,   r   r   r   rq   
num_labelsZclassifier_dropoutclassification_headr   )r-   rp   kwargsr.   r"   r#   r,   S  s   
z%MvpForSequenceClassification.__init__c                 C   r!  r   )r   r  r9  r  r   r"   r"   r#   r  `  r#  z3MvpForSequenceClassification.set_lightweight_tuningNr   r[   r  r  r   r  r   r	  r   r
  r$  r   r^   r   r   r_   c                 C   s.  |dur|n| j j}|durd}|du r!|	dur!td| jj | j|||||||||	|
||||d}|d }|| j j|j	}t
t|ddkrStd||ddf |dd|ddddddf }| |}d}|dur| j jdu r| j jdkrd	| j _n| j jdkr|jtjks|jtjkrd
| j _nd| j _| j jd	krt }| j jdkr|| | }n,|||}n&| j jd
krt }||d| j j|d}n| j jdkrt }|||}|s|f|dd  }|dur|f| S |S t|||j|j|j|j|j |j!|j"d	S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Example of single-label classification:

        Fine-tuning a model on `num_labels` classes
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForSequenceClassification

        >>> num_labels = 2  # for example, this is a binary classification task
        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForSequenceClassification.from_pretrained("RUCAIBox/mvp", num_labels=num_labels)

        >>> inputs = tokenizer("Classify: Hello, my dog is cute", return_tensors="pt")
        >>> labels = torch.tensor(1)  # the real label for inputs

        >>> loss = model(**inputs, labels=labels).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     logits = model(**inputs).logits

        >>> predicted_class_id = logits.argmax()
        ```
        NFz8Passing input embeddings is currently not supported for r[   r  r  r   r  r   r	  r   r
  r   r^   r   r   r   r   z7All examples must have the same number of <eos> tokens.r   Z
regressionZsingle_label_classificationZmulti_label_classificationr%  )#rp   r   NotImplementedErrorr/   r=   r   eqZeos_token_idrh   r3   r   r4   Zunique_consecutivesumr!   rT   re   r9  Zproblem_typer8  r2   r6   rA   r	   squeezer   r   r   r   r  r  r   r  r   r  )r-   r   r[   r  r  r   r  r   r	  r   r
  r$  r   r^   r   r   r   rX   Zeos_maskZsentence_representationr'  r&  r)  r*  r"   r"   r#   r9   d  s   M$

$

z$MvpForSequenceClassification.forwardr  )r=   r>   r?   r  r   r,   r  r   r   r4   r   rB   r   r   rn   r   r   r   r9   rC   r"   r"   r.   r#   r7  J  sj    	

r7  c                &       s  e Zd ZddgZ fddZdd Ze																ddeej	 d	eej	 d
eej
 deej
 deej	 deej	 deej	 deeej  deej
 deej
 deej deej dee dee dee dee deeef f"ddZ  ZS )MvpForQuestionAnsweringr   r   c                    sB   t  | d|_|j| _t|| _t|j|j| _| 	  d S r(   )
r+   r,   r8  r   r   r   rN   hidden_size
qa_outputsr   r}   r.   r"   r#   r,      s   
z MvpForQuestionAnswering.__init__c                 C   r!  r   )r   r  rB  r  r   r"   r"   r#   r    r#  z.MvpForQuestionAnswering.set_lightweight_tuningNr   r[   r  r  r   r  r   r	  start_positionsend_positionsr   r
  r   r^   r   r   r_   c                 C   sz  |dur|n| j j}|	dur|
durd}| j||||||||||||||d}|d }| |}|jddd\}}|d }|d }d}|	dur|
durt|	 dkr^|	d}	t|
 dkrk|
d}
|d}|		d|}	|
	d|}
t
|d}|||	}|||
}|| d	 }|s||f|dd  }|dur|f| S |S t||||j|j|j|j|j|j|jd

S )aX  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Mvp uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            If you want to change padding behavior, you should read [`modeling_mvp._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        Fine-tuning a model for extrative question answering, and our model also supports generative question answering
        using `BartForConditionalGeneration`
        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MvpForQuestionAnswering

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForQuestionAnswering.from_pretrained("RUCAIBox/mvp")

        >>> inputs = tokenizer(
        ...     "Answer the following question: Who was Jim Henson? [SEP] Jim Henson was a nice puppet",
        ...     return_tensors="pt",
        ... )
        >>> target_start_index = torch.tensor([18])
        >>> target_end_index = torch.tensor([19])

        >>> loss = model(**inputs, start_positions=target_start_index, end_positions=target_end_index).loss
        >>> loss.backward()
        ```

        Inference after the model fine-tuned
        ```python
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> answer_start_index = outputs.start_logits.argmax()
        >>> answer_end_index = outputs.end_logits.argmax()

        >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
        >>> predict_answer = tokenizer.decode(predict_answer_tokens)
        ```
        NFr;  r   r   r   r`   )Zignore_indexr)   )
r&  start_logits
end_logitsr   r  r  r   r  r   r  )rp   r   r   rB  r   r?  rV   r   re   r   r   r   r   r  r  r   r  r   r  )r-   r   r[   r  r  r   r  r   r	  rC  rD  r   r
  r   r^   r   r   r   Zsequence_outputr'  rE  rF  Z
total_lossZignored_indexr)  Z
start_lossZend_lossr*  r"   r"   r#   r9     sp   S







zMvpForQuestionAnswering.forwardr5  )r=   r>   r?   r  r,   r  r   r   r4   rB   r   r   r   rn   r   r   r   r9   rC   r"   r"   r.   r#   r@    sp    	

r@  c                       s(   e Zd ZdZ fddZdd Z  ZS )MvpDecoderWrapperz
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    c                    s   t  | t|| _d S r   )r+   r,   r   r   r}   r.   r"   r#   r,     s   zMvpDecoderWrapper.__init__c                 O   s   | j |i |S r   r  )r-   argsr:  r"   r"   r#   r9     s   zMvpDecoderWrapper.forward)r=   r>   r?   r@   r,   r9   rC   r"   r"   r.   r#   rG    s    rG  c                        s  e Zd ZdgZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Ze													d%deej deej deej deej deej deej deeej  deej deej dee dee dee dee d eeef fd!d"Zed#d$ Z  ZS )&MvpForCausalLMr  c                    sN   t |}d|_d|_t | t|| _tj	|j
|jdd| _|   d S )NTFrK   )copydeepcopyrI   Zis_encoder_decoderr+   r,   rG  r   r   rN   rA  r   r  r   r}   r.   r"   r#   r,     s   

zMvpForCausalLM.__init__c                 C   s
   | j jjS r   r   r   r   r   r"   r"   r#   r     r   z#MvpForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   rL  r   r"   r"   r#   r     s   z#MvpForCausalLM.set_input_embeddingsc                 C   r   r   r  r   r"   r"   r#   r    r   z$MvpForCausalLM.get_output_embeddingsc                 C   r   r   r  r  r"   r"   r#   r     r   z$MvpForCausalLM.set_output_embeddingsc                 C   s   || j _d S r   r   r   )r-   r   r"   r"   r#   set_decoder  s   zMvpForCausalLM.set_decoderc                 C   s   | j jS r   rM  r   r"   r"   r#   r    s   zMvpForCausalLM.get_decoderc                 C   r!  r   r"  r   r"   r"   r#   r    r#  z%MvpForCausalLM.set_lightweight_tuningNr   r[   r   r   r   r   r   r   r$  r   r^   r   r   r_   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jj|||||||||
|||d}| |d }d}|	durNt }||d| j j	|	d}|sd|f|dd  }|durb|f| S |S t
|||j|j|j|jdS )a  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MvpForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("RUCAIBox/mvp")
        >>> model = MvpForCausalLM.from_pretrained("RUCAIBox/mvp", add_cross_attention=False)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> list(logits.shape)
        [1, 8, 50267]
        ```Nr  r   r   r   )r&  r'  r   rX   r   r   )rp   r^   r   r   r   r   r  r   rT   r   r   r   rX   r   r   )r-   r   r[   r   r   r   r   r   r   r$  r   r^   r   r   r   r'  r&  r)  r*  r"   r"   r#   r9     sD   ,zMvpForCausalLM.forwardc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr"   c                 3   r,  r<   r-  r.  r/  r"   r#   r   4  r1  z0MvpForCausalLM._reorder_cache.<locals>.<genexpr>r2  r3  r"   r/  r#   r4  /  s   zMvpForCausalLM._reorder_cache)NNNNNNNNNNNNN)r=   r>   r?   r  r,   r   r   r  r   rN  r  r  r   r   r4   r   rB   r   r   rn   r   r   r   r9   r6  r4  rC   r"   r"   r.   r#   rI    sn    	

UrI  )rI  r  r@  r7  r   r   )=r@   rJ  r   typingr   r   r   r   r4   Ztorch.utils.checkpointr   Ztorch.nnr   r   r	   Zactivationsr   Z
generationr   Zmodeling_attn_mask_utilsr   r   Zmodeling_outputsr   r   r   r   r   r   r   Zmodeling_utilsr   utilsr   r   Zconfiguration_mvpr   Z
get_loggerr=   r   rB   rA   r$   r   r%   ModulerD   ro   r   r   r   r   r   r   r   r  r7  r@  rG  rI  __all__r"   r"   r"   r#   <module>   sf   $	
 H~ E    @ - * 