# coding=utf-8
"""PyTorch ProphetNet model, ported from the ProphetNet repo (fairseq version)."""

import copy
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import Tensor, nn
from torch.nn import LayerNorm

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import ModelOutput, auto_docstring, logging
from .configuration_prophetnet import ProphetNetConfig


logger = logging.get_logger(__name__)


def softmax(hidden_state, dim, onnx_trace=False):
    if onnx_trace:
        return nn.functional.softmax(hidden_state.float(), dim=dim)
    else:
        return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)


def ngram_attention_bias(sequence_length, ngram, device, dtype):
    """
    This function computes the bias for the predict stream
    """
    left_block = (
        torch.ones((ngram, sequence_length, sequence_length), device=device, dtype=dtype) * torch.finfo(dtype).min
    )
    right_block = left_block.detach().clone()
    # create bias
    for stream_idx in range(ngram):
        right_block[stream_idx].fill_diagonal_(0, wrap=False)
        left_block[stream_idx].triu_(-stream_idx + 1)

    left_block[:, :, 0] = 0
    return torch.cat([left_block, right_block], dim=2)


def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
    """
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    """
    inv_relative_positions = -relative_positions
    rel_positions_bucket = 0

    if is_bidirectional:
        num_buckets = num_buckets // 2
        rel_positions_bucket = (
            rel_positions_bucket
            + torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets
        )
        inv_relative_positions = torch.abs(inv_relative_positions)
    else:
        inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions))

    max_exact = num_buckets // 2
    is_small = torch.lt(inv_relative_positions, max_exact)
    val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log(
        max_distance / max_exact
    ) * (num_buckets - max_exact)
    val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int()
    rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large)
    return rel_positions_bucket
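
# A toy illustration of the bucketing above (an editorial sketch, not part of the original
# module): for unidirectional bucketing, only distances to past positions are kept, and small
# distances get their own bucket while large ones share logarithmically spaced buckets:
#
#   rel = torch.arange(-4, 5).unsqueeze(0)                      # relative positions -4 ... 4
#   compute_relative_buckets(32, 128, rel, is_bidirectional=False)
#   # -> tensor([[4, 3, 2, 1, 0, 0, 0, 0, 0]])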


def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
    """
    This function computes both main and predict relative position buckets. For more detail, see paper.
    """
    # main stream
    main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
    main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)

    # predicting stream
    predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
    predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
    predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)

    # get both position buckets
    main_relative_position_buckets = compute_relative_buckets(
        num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
    )
    predict_relative_position_buckets = compute_relative_buckets(
        num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
    )
    return main_relative_position_buckets, predict_relative_position_buckets


@dataclass
class ProphetNetSeq2SeqLMOutput(ModelOutput):
    r"""
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
            softmax, used to compute the weighted average in the self-attention heads.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 C      t dt | jS Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningrJ   selfr   r   r   decoder_cross_attentions   
   z2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions)__name__
__module____qualname____doc__rB   r   r   FloatTensor__annotations__rC   rD   rE   r   rF   rG   rH   rI   rJ   rK   rL   rM   propertyrV   r   r   r   r   rA   q   s    
 <rA   c                   @   s   e Zd ZU dZejed< dZeej ed< dZ	ee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed	< dZee
ej  ed
< dZeej ed< dZee
ej  ed< dZee
ej  ed< edd ZdS )ProphetNetSeq2SeqModelOutputa2  
    Base class for model encoder's outputs that also contains: pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`, *optional*):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: torch.FloatTensor
    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None

    @property
    def decoder_cross_attentions(self):
        warnings.warn(
            "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
            " instead.",
            FutureWarning,
        )
        return self.cross_attentions


@dataclass
class ProphetNetDecoderModelOutput(ModelOutput):
    r"""
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: torch.FloatTensor
    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


@dataclass
class ProphetNetDecoderLMOutput(ModelOutput):
    r"""
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    logits_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None


class ProphetNetPreTrainedModel(PreTrainedModel):
    config_class = ProphetNetConfig
    base_model_prefix = "prophetnet"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _shift_right(self, input_ids):
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert decoder_start_token_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the"
            " pad_token_id. See ProphetNet docs for more information."
        )

        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"

        return shifted_input_ids
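
# Usage sketch (an editorial note, not part of the original module): `_shift_right` turns
# `labels` into `decoder_input_ids` by prepending `decoder_start_token_id` and dropping the
# last token. Assuming pad_token_id == decoder_start_token_id == 0:
#
#   labels:             [[ 71, 103,   2]]
#   _shift_right(...):  [[  0,  71, 103]]
#
# -100 entries in `labels` (the ignore index of the loss) are replaced by `pad_token_id`,
# so the decoder never sees the ignore index as an input id.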


class ProphetNetPositionalEmbeddings(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    """

    def __init__(self, config: ProphetNetConfig) -> None:
        self.max_length = config.max_position_embeddings
        super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)

    def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
        assert (position_ids is None) or (self.padding_idx is None), (
            "If position_ids is pre-computed then padding_idx should not be set."
        )

        if position_ids is None:
            if past_key_values is not None:
                # position_ids is the same for every token when decoding a single step
                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
                prev_num_input_ids = past_key_values[0][0].shape[2]
                num_input_ids = inputs_shape[1] + prev_num_input_ids
                position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * int(
                    self.padding_idx + num_input_ids
                )
            else:
                if attention_mask is None:
                    attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)

                # retrieve position_ids from input_ids / attention_mask
                position_ids = (
                    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
                ).long() + self.padding_idx

                # make sure position_ids are not bigger than max_length
                position_ids = position_ids.clamp(0, self.max_length - 1)

        return super().forward(position_ids), position_ids

    def _forward(self, position_ids):
        return super().forward(position_ids)


class ProphetNetAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: ProphetNetConfig, num_attn_heads: int):
        super().__init__()
        hidden_size = config.hidden_size

        self.attention_dropout = config.attention_dropout
        self.dropout = config.dropout
        self.num_attn_heads = num_attn_heads
        self.head_dim = hidden_size // num_attn_heads

        assert self.head_dim * num_attn_heads == hidden_size, (
            "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
            " `config.num_decoder_attention_heads`"
        )

        self.key_proj = nn.Linear(hidden_size, hidden_size)
        self.value_proj = nn.Linear(hidden_size, hidden_size)
        self.query_proj = nn.Linear(hidden_size, hidden_size)

        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states,
        key_value_states: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_head_mask: Optional[Tensor] = None,
        past_key_value: Optional[Tuple[Tensor]] = None,
        output_attentions: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        batch_size, tgt_len, hidden_size = hidden_states.size()

        # if key_value_states are provided, this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None
        assert list(hidden_states.size()) == [batch_size, tgt_len, hidden_size], (
            f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
        )

        # previous time steps are cached - no need to recompute key and value if they are static
        query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)

        if is_cross_attention and past_key_value is not None:
            # reuse k, v from cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.key_proj(key_value_states), -1, batch_size)
            value_states = self._shape(self.value_proj(key_value_states), -1, batch_size)
        else:
            # self_attention
            key_states = self._shape(self.key_proj(hidden_states), -1, batch_size)
            value_states = self._shape(self.value_proj(hidden_states), -1, batch_size)

        if is_cross_attention:
            # if cross_attention, save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states
            past_key_value = (key_states, value_states)

        # project states into the correct shape
        proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, batch_size).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(2)
        attn_weights = torch.einsum("bsij,bsjk->bsik", query_states, key_states.transpose(2, 3))
        expected_shape = (batch_size, self.num_attn_heads, tgt_len, src_len)
        if attn_weights.size() != expected_shape:
            raise ValueError(f"Attention weights should have size {expected_shape}, but is {attn_weights.size()}")

        # This is part of a workaround to get around fork/join parallelism not supporting Optional types
        if attention_mask is not None and attention_mask.dim() == 0:
            attention_mask = None

        expected_shape = (batch_size, self.num_attn_heads, 1, src_len)
        if attention_mask is not None and attention_mask.size() != expected_shape:
            raise ValueError(f"Attention mask should have size {expected_shape}, but is {attention_mask.size()}")
        if attention_mask is not None:  # don't attend to padding symbols
            attn_weights = attn_weights + attention_mask
        if output_attentions:
            attn_weights_reshaped = attn_weights
        else:
            attn_weights_reshaped = None

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            assert layer_head_mask.size() == (self.num_attn_heads,), (
                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
                f" {layer_head_mask.size()}"
            )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
                batch_size, self.num_attn_heads, tgt_len, src_len
            )

            # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
            attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped

        attn_probs = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
        attn_output = torch.einsum("bsij,bsjk->bsik", attn_probs, value_states)
        expected_shape = (batch_size, self.num_attn_heads, tgt_len, self.head_dim)
        if attn_output.size() != expected_shape:
            raise ValueError(f"`attn_output` should have shape {expected_shape}, but is of shape {attn_output.size()}")

        attn_output = attn_output.transpose(1, 2).reshape(batch_size, tgt_len, hidden_size)
        attn_output = self.out_proj(attn_output)

        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, attn_weights_reshaped, past_key_value


class ProphetNetFeedForward(nn.Module):
    """
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    """

    def __init__(self, config: ProphetNetConfig, ffn_dim: int):
        super().__init__()
        self.activation_fn = ACT2FN[config.activation_function]
        self.intermediate = nn.Linear(config.hidden_size, ffn_dim)
        self.output = nn.Linear(ffn_dim, config.hidden_size)
        self.activation_dropout = config.activation_dropout
        self.dropout = config.dropout

    def forward(self, hidden_states):
        hidden_states = self.intermediate(hidden_states)
        hidden_states = self.activation_fn(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.output(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        return hidden_states


class ProphetNetNgramSelfAttention(nn.Module):
    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.num_buckets = config.num_buckets
        self.relative_max_distance = config.relative_max_distance
        self.num_attn_heads = config.num_decoder_attention_heads
        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        self.head_dim = config.hidden_size // self.num_attn_heads
        self.ngram = config.ngram

        assert self.head_dim * self.num_attn_heads == config.hidden_size, (
            "config.hidden_size must be divisible by num_attn_heads"
        )
        # key, value, query projection
        self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)

        # out projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)

        # rel position embeddings
        self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)

        # for onnx runtime
        self.onnx_trace = False

    def _shape(self, tensor, seq_len, batch_size):
        return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def forward(
        self,
        hidden_states,
        past_key_value: Optional[Tuple[Tensor]] = None,
        attention_mask=None,
        layer_head_mask=None,
        extended_predict_attention_mask=None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
    ):
        batch_size, ngram_sequence_length, hidden_size = hidden_states.size()
        assert list(hidden_states.size()) == [batch_size, ngram_sequence_length, hidden_size], (
            f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape"
            f" {hidden_states.shape}"
        )

        # project
        query_states = self.query_proj(hidden_states)
        key_states = self.key_proj(hidden_states)
        value_states = self.value_proj(hidden_states)

        # normalize
        query_states = query_states / (self.head_dim**0.5)

        # reshape
        query_states = self._shape(query_states, ngram_sequence_length, batch_size)
        key_states = self._shape(key_states, -1, batch_size)
        value_states = self._shape(value_states, -1, batch_size)
        proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)

        query_states = query_states.view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        # chunk into main stream and predict stream
        hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=1)
        query_states_list = query_states.chunk(1 + self.ngram, dim=2)
        key_states_list = key_states.chunk(1 + self.ngram, dim=2)
        value_states_list = value_states.chunk(1 + self.ngram, dim=2)

        main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
        main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
        main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
        main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]

        # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim)
        if past_key_value is not None:
            prev_main_key_states = past_key_value[0]
            main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=2)
            prev_main_value_states = past_key_value[1]
            main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=2)

        # Update cache
        past_key_value = (main_key_states, main_value_states)

        # get seq_length of main stream only
        sequence_length = ngram_sequence_length // (1 + self.ngram)

        # MAIN-STREAM
        # main attn weights
        main_attn_weights = torch.einsum("bntc,bncs->bnts", main_query_states, main_key_states.transpose(2, 3))

        # retrieve relative position embeddings for each layer -> see paper for more details
        main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
            main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
        )

        main_attn_weights = main_attn_weights + main_relative_pos_embeddings

        if attention_mask is not None:
            main_attn_weights = main_attn_weights + attention_mask

        main_attn_probs = softmax(main_attn_weights, dim=-1, onnx_trace=self.onnx_trace).type_as(main_attn_weights)

        if layer_head_mask is not None:
            assert layer_head_mask.size() == (self.num_attn_heads,), (
                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
                f" {layer_head_mask.size()}"
            )
            main_attn_probs = layer_head_mask.view(1, -1, 1, 1) * main_attn_probs.view(
                batch_size, self.num_attn_heads, -1, sequence_length
            )

        main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
        # project to attn_output
        main_attn_output = torch.einsum("bntc,bncs->bnts", main_attn_probs, main_value_states)

        # reshape so that num_heads dim is merged into last `head_dim` axis
        main_attn_output = main_attn_output.transpose(1, 2).reshape(batch_size, 1, sequence_length, hidden_size)
        main_attn_output = self.out_proj(main_attn_output)

        # PREDICT-STREAM
        # [batch_size, ngram, number_heads, sequence_length, head_dim]
        predict_query_states = torch.stack(predict_query_states_list, 1).view(
            batch_size, self.ngram, self.num_attn_heads, sequence_length, self.head_dim
        )

        # [batch_size, ngram, number_heads, 2*sequence_length, head_dim]
        predict_key_states = torch.stack([torch.cat([main_key_states, key], 2) for key in predict_key_states_list], 1)

        # [batch_size, sequence_length, ngram, hidden_size]
        predict_hidden_states = torch.stack(hidden_states_predict_list, dim=2)

        # [batch_size, number_heads, ngram, 2*sequence_length, head_dim]
        predict_value_states = torch.cat(
            [torch.cat([main_value_states, v_p], 2).unsqueeze(2) for v_p in predict_value_states_list], 2
        )

        # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
        predict_attn_weights = torch.einsum("bnhtc,bnhsc->bnhts", (predict_query_states, predict_key_states))

        # retrieve relative position embeddings for each layer -> see paper for more details
        predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
            predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
        )

        # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
        predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings

        if extended_predict_attention_mask is not None:
            # permute predict attention mask to [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
            extended_predict_attention_mask = extended_predict_attention_mask.permute(0, 2, 1, 3, 4)
            extended_predict_attention_mask = extended_predict_attention_mask.to(predict_attn_weights.dtype)
            predict_attn_weights = predict_attn_weights + extended_predict_attention_mask

        predict_attn_probs = softmax(predict_attn_weights, dim=-1, onnx_trace=self.onnx_trace).type_as(
            predict_attn_weights
        )

        if layer_head_mask is not None:
            assert layer_head_mask.size() == (self.num_attn_heads,), (
                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
                f" {layer_head_mask.size()}"
            )
            predict_attn_probs = layer_head_mask.view(1, 1, -1, 1, 1) * predict_attn_probs

        predict_attn_probs = nn.functional.dropout(
            predict_attn_probs, p=self.attention_dropout, training=self.training
        )
        # project to attention output
        # [batch_size, ngram, number_heads, sequence_length, head_dim]
        predict_attn_output = torch.einsum(
            "bnhts,bnhsc->bnhtc", (predict_attn_probs, predict_value_states.transpose(1, 2))
        )

        # reshape so that num_heads dim is merged into last `head_dim` axis
        # [batch_size, ngram, sequence_length, hidden_size]
        predict_attn_output = predict_attn_output.transpose(2, 3)
        predict_attn_output = predict_attn_output.reshape(batch_size, self.ngram, sequence_length, hidden_size)
        predict_attn_output = self.out_proj(predict_attn_output)

        # concat to single attn output
        # [batch_size, (1+ngram)*sequence_length, hidden_size]
        attn_output = torch.cat([main_attn_output, predict_attn_output], 1).view(batch_size, -1, hidden_size)
        # reshape into better form for `config.output_attentions`
        main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, sequence_length, -1)

        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)

        return attn_output, main_attn_probs, predict_attn_probs, past_key_value

    def get_main_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
    ):
        # input hidden_states [batch_size, sequence_length, hidden_size]
        # input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
        # input position_ids [batch_size, sequence_length] or [1, 1]
        batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
        attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
        if main_relative_position_buckets is None:
            batch_size, sequence_length = hidden_states.shape[:2]
            relative_positions = (
                torch.arange(1, attn_weights.shape[-1] + 1)
                .unsqueeze(0)
                .unsqueeze(0)
                .repeat(batch_size, sequence_length, 1)
                .to(position_ids.device)
            )
            # [batch_size, sequence_length, sequence_length+1]
            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
            main_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        # [batch_size, sequence_length, num_buckets * num_heads]
        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
        rel_pos_embeddings = rel_pos_embeddings.view(
            rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
        )
        rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
        # [batch_size, num_heads, sequence_length, num_buckets]
        rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))

        main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
        # [batch_size * num_heads * sequence_length, sequence_length]
        main_relative_position_buckets = main_relative_position_buckets.view(
            -1, main_relative_position_buckets.shape[-1]
        )
        main_relative_position_buckets = main_relative_position_buckets.long()
        # [batch_size * num_heads * sequence_length, sequence_length]
        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))

        main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
        main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
        return main_relative_pos_embeddings

    def get_predict_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
    ):
        # input hidden_states [batch_size, sequence_length, ngram, hidden_size]
        # input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length]
        # input position_ids [batch_size, sequence_length] or [1, 1]
        # input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None
        batch_size, sequence_length = hidden_states.shape[0:2]

        if predict_relative_position_buckets is None:
            key_sequence_length = attn_weights.shape[-1]
            assert position_ids[0][0] == key_sequence_length - 1, (
                "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
            )
            relative_positions = (
                torch.arange(0, key_sequence_length)
                .unsqueeze(0)
                .unsqueeze(0)
                .repeat(batch_size, sequence_length, 1)
                .to(position_ids.device)
            )

            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
            predict_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        # [batch_size, ngram, sequence_length, hidden_size]
        hidden_states = hidden_states.transpose(1, 2)
        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)

        # [batch_size, ngram, sequence_length, num_buckets, num_heads]
        rel_pos_embeddings = rel_pos_embeddings.view(
            hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
        )
        rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
        # [batch_size * ngram * sequence_length * num_heads, num_buckets]
        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
        # [ngram, batch_size, num_heads * sequence_length, -1]
        predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
        predict_relative_position_buckets = predict_relative_position_buckets.repeat(
            self.ngram, 1, self.num_attn_heads, 1
        )
        # [ngram * batch_size * num_heads * sequence_length, -1]
        predict_relative_position_buckets = predict_relative_position_buckets.view(
            -1, predict_relative_position_buckets.size(-1)
        ).long()

        predict_relative_pos_embeddings = torch.gather(
            rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
        )

        # [batch_size, ngram, num_heads, sequence_length, -1]
        predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
            batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
        )

        return predict_relative_pos_embeddings


class ProphetNetEncoderLayer(nn.Module):
    """
    Encoder block for Prophetnet
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        # 1st residual block
        self.self_attn = ProphetNetAttention(config, config.num_encoder_attention_heads)
        self.self_attn_layer_norm = LayerNorm(config.hidden_size)

        # 2nd residual block
        self.feed_forward = ProphetNetFeedForward(config, config.encoder_ffn_dim)
        self.feed_forward_layer_norm = LayerNorm(config.hidden_size)

    def forward(self, hidden_states, attention_mask, layer_head_mask, output_attentions: bool = False):
        # 1st residual block
        attention_output, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)

        # 2nd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class ProphetNetDecoderLayer(nn.Module):
    """
    Decoder block for Prophetnet
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        # 1st residual block
        self.self_attn = ProphetNetNgramSelfAttention(config)
        self.self_attn_layer_norm = LayerNorm(config.hidden_size)

        # 2nd residual block
        if config.add_cross_attention:
            self.cross_attn = ProphetNetAttention(config, config.num_decoder_attention_heads)
            self.cross_attn_layer_norm = LayerNorm(config.hidden_size)

        # 3rd residual block
        self.feed_forward = ProphetNetFeedForward(config, config.decoder_ffn_dim)
        self.feed_forward_layer_norm = LayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attn_mask=None,
        layer_head_mask=None,
        cross_attn_layer_head_mask=None,
        extended_predict_attention_mask=None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
        past_key_value=None,
        use_cache: bool = True,
        output_attentions: bool = False,
    ):
        # 1st residual block
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            extended_predict_attention_mask=extended_predict_attention_mask,
            main_relative_position_buckets=main_relative_position_buckets,
            predict_relative_position_buckets=predict_relative_position_buckets,
            position_ids=position_ids,
        )
        hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)

        # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
        cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            # 2nd residual block
            attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attn_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # 3rd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


@auto_docstring(
    custom_intro="""
    The standalone encoder part of the ProphetNetModel.
    """
)
class ProphetNetEncoder(ProphetNetPreTrainedModel):
    def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
        r"""
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        """
        super().__init__(config)

        self.word_embeddings = (
            word_embeddings
            if word_embeddings is not None
            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        )
        self.position_embeddings = ProphetNetPositionalEmbeddings(config)
        self.embeddings_layer_norm = LayerNorm(config.hidden_size)

        self.layers = nn.ModuleList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, value):
        self.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
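
        >>> # editorial note: hidden states are shaped (batch_size, sequence_length, config.hidden_size);
        >>> # hidden_size is 1024 for the large checkpoints above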
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None and inputs_embeds is None:
            raise ValueError("Either input_ids or inputs_embeds has to be passed.")
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError("Make sure to only pass input_ids or inputs_embeds.")
        elif input_ids is not None and inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # prepare attention mask
        if attention_mask is not None:
            extended_attention_mask = (
                1.0 - attention_mask[:, None, None, :].repeat(1, self.config.num_encoder_attention_heads, 1, 1)
            ) * torch.finfo(self.dtype).min
            extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype)
        else:
            extended_attention_mask = None

        position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device)

        hidden_states = inputs_embeds + position_embeddings
        hidden_states = self.embeddings_layer_norm(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training)

        encoder_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            assert head_mask.size()[0] == (len(self.layers)), (
                f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                f" {head_mask.size()[0]}."
            )
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_hidden_states = encoder_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    extended_attention_mask,
                    (head_mask[idx] if head_mask is not None else None),
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_hidden_states = encoder_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions
        )


@auto_docstring(
    custom_intro="""
    The standalone decoder part of the ProphetNetModel.
    """
)
class ProphetNetDecoder(ProphetNetPreTrainedModel):
    def __init__(self, config: ProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
        r"""
        word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
            The word embedding parameters. This can be used to initialize [`ProphetNetDecoder`] with pre-defined word
            embeddings instead of randomly initialized word embeddings.
        """
        super().__init__(config)

        self.ngram = config.ngram
        self.num_buckets = config.num_buckets
        self.relative_max_distance = config.relative_max_distance
        self.dropout = config.dropout
        self.max_target_positions = config.max_position_embeddings

        self.word_embeddings = (
            word_embeddings
            if word_embeddings is not None
            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        )
        self.position_embeddings = ProphetNetPositionalEmbeddings(config)

        self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
        self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
        self.embeddings_layer_norm = LayerNorm(config.hidden_size)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, value):
        self.word_embeddings = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ProphetNetDecoderModelOutput]:
        r"""
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
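
        >>> # editorial note: with add_cross_attention=False the decoder runs standalone; the predict
        >>> # stream is exposed as outputs.last_hidden_state_ngram (ngram * sequence_length positions)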
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None and inputs_embeds is None:
            raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.")
        elif input_ids is not None and inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        batch_size, sequence_length = inputs_embeds.shape[:2]

        main_stream_pos_embed, position_ids = self.position_embeddings(
            (batch_size, sequence_length),
            device=inputs_embeds.device,
            past_key_values=past_key_values,
        )

        if past_key_values is not None:
            main_relative_position_buckets, predict_relative_position_buckets = None, None
        else:
            (
                main_relative_position_buckets,
                predict_relative_position_buckets,
            ) = self.compute_buffered_relative_buckets(position_ids)
        predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)

        # add position embeddings
        hidden_states = inputs_embeds + main_stream_pos_embed

        ngram_embeddings = self.ngram_embeddings.weight

        # prepare attention mask
        if past_key_values is not None:
            assert hidden_states.size(1) == 1, (
                "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
            )

            ngram_hidden_states = [
                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
                for ngram in range(self.ngram)
            ]
            extended_attention_mask = None
            extended_predict_attention_mask = None
        else:
            ngram_hidden_states = [
                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram)
            ]
            extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
            extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)

        # prepare encoder attention mask
        if encoder_attention_mask is not None:
            extended_encoder_attention_mask = (
                1.0 - encoder_attention_mask[:, None, None, :].repeat(1, self.config.num_decoder_attention_heads, 1, 1)
            ) * torch.finfo(self.dtype).min
            extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype)
        else:
            extended_encoder_attention_mask = None

        hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 1)

        if self.embeddings_layer_norm:
            hidden_states = self.embeddings_layer_norm(hidden_states)

        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        # init attentions, hidden_states and cache with empty tuples
        all_main_stream_hidden_states = () if output_hidden_states else None
        all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None

        all_main_stream_attns = () if output_attentions else None
        all_ngram_stream_attns = () if output_attentions else None
        all_cross_attns = () if output_attentions and self.config.add_cross_attention else None

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        present_key_values = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                assert attn_mask.size()[0] == (len(self.layers)), (
                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                    f" {attn_mask.size()[0]}."
                )
        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                # grad cannot be kept because tensor is sliced
                all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
                if self.config.ngram > 0:
                    all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    extended_attention_mask,
                    encoder_hidden_states,
                    extended_encoder_attention_mask,
                    (head_mask[idx] if head_mask is not None else None),
                    (cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
                    extended_predict_attention_mask,
                    main_relative_position_buckets,
                    predict_relative_position_buckets,
                    position_ids,
                    None,
                    use_cache,
                    output_attentions,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attn_mask=extended_encoder_attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    extended_predict_attention_mask=extended_predict_attention_mask,
                    main_relative_position_buckets=main_relative_position_buckets,
                    predict_relative_position_buckets=predict_relative_position_buckets,
                    position_ids=position_ids,
                    past_key_value=past_key_value,
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                present_key_values += (layer_outputs[4 if output_attentions else 1],)

            if output_attentions:
                all_main_stream_attns += (layer_outputs[1],)
                all_ngram_stream_attns += (layer_outputs[2],)

                if self.config.add_cross_attention:
                    all_cross_attns += (layer_outputs[3],)

        if output_hidden_states:
            all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
            if self.config.ngram > 0:
                all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)

        # split last_hidden_state for return
        last_hidden_state = hidden_states[:, :sequence_length]
        last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.config.ngram > 0 else None

        if not return_dict:
            return tuple(
                v
                for v in [
                    last_hidden_state,
                    last_hidden_state_ngram,
                    present_key_values,
                    all_main_stream_hidden_states,
                    all_ngram_stream_hidden_states,
                    all_main_stream_attns,
                    all_ngram_stream_attns,
                    all_cross_attns,
                ]
                if v is not None
            )
        return ProphetNetDecoderModelOutput(
            last_hidden_state=last_hidden_state,
            last_hidden_state_ngram=last_hidden_state_ngram,
            past_key_values=present_key_values,
            hidden_states=all_main_stream_hidden_states,
            hidden_states_ngram=all_ngram_stream_hidden_states,
            attentions=all_main_stream_attns,
            ngram_attentions=all_ngram_stream_attns,
            cross_attentions=all_cross_attns,
        )

    def compute_buffered_relative_buckets(self, position_ids):
        batch_size, sequence_length = position_ids.shape

        position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
        main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
            self.num_buckets, self.relative_max_distance, position_ids
        )

        # buffer relative buckets
        main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
        predict_relative_buckets = torch.cat(
            [
                predict_relative_buckets[:, :sequence_length, :sequence_length],
                predict_relative_buckets[
                    :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
                ],
            ],
            2,
        ).repeat(batch_size, 1, 1)

        return main_relative_buckets, predict_relative_buckets

    def prepare_attention_mask(self, hidden_states, attention_mask):
        batch_size, seq_length = hidden_states.shape[:2]

        # get causal mask
        causal_mask = torch.full(
            (seq_length, seq_length),
            torch.finfo(hidden_states.dtype).min,
            dtype=hidden_states.dtype,
            device=hidden_states.device,
        )
        causal_mask = torch.triu(causal_mask, 1)

        extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
            (batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
        )

        # add usual attention mask
        if attention_mask is not None:
            extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(self.dtype).min
            extended_attention_mask = extended_causal_mask + extended_attention_mask
        else:
            extended_attention_mask = extended_causal_mask
        return extended_attention_mask.to(hidden_states.dtype)

    def prepare_predict_attention_mask(self, hidden_states, attention_mask):
        batch_size, seq_length = hidden_states.shape[:2]

        # get causal mask
        predict_causal_mask = ngram_attention_bias(
            self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
        )
        predict_causal_mask = torch.cat(
            [
                predict_causal_mask[:, :seq_length, :seq_length],
                predict_causal_mask[
                    :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
                ],
            ],
            dim=-1,
        )
        extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
            (batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
        )

        # add usual attention mask
        if attention_mask is not None:
            extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
            extended_attention_mask = extended_attention_mask.expand(
                (batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
            )
            # predicted stream attention_mask should always be 0
            extended_attention_mask = torch.cat(
                [extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
            )
            extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
        else:
            extended_predict_attention_mask = extended_predict_causal_mask
        return extended_predict_attention_mask.to(hidden_states.dtype)


@auto_docstring
class ProphetNetModel(ProphetNetPreTrainedModel):
    _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]

    def __init__(self, config: ProphetNetConfig):
        super().__init__(config)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        encoder_config = copy.deepcopy(config)
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = ProphetNetEncoder(encoder_config, self.word_embeddings)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, value):
        self.word_embeddings = value
        self.encoder.word_embeddings = self.word_embeddings
        self.decoder.word_embeddings = self.word_embeddings

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
            self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ProphetNetSeq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
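
        >>> # editorial note: the main stream has one state per decoder position, while the predict
        >>> # stream stacks config.ngram future-token streams along the sequence dimension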
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs
        return ProphetNetSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram,
            decoder_attentions=decoder_outputs.attentions,
            decoder_ngram_attentions=decoder_outputs.ngram_attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    """
)
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]

    def __init__(self, config: ProphetNetConfig):
        super().__init__(config)
        self.prophetnet = ProphetNetModel(config)
        self.padding_idx = config.pad_token_id
        self.disable_ngram_loss = config.disable_ngram_loss

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def _tie_weights(self):
        if self.config.tie_word_embeddings:
            self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head)

    def get_input_embeddings(self):
        return self.prophetnet.word_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        decoder_head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[torch.Tensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, ProphetNetSeq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
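
        >>> # editorial note: the main-stream logits score the next token, while logits_ngram scores
        >>> # the additional predict streams (tokens further ahead, up to config.ngram positions)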
        ```N)r}   r   rC  rD  r  rE  r'  rF  rE   r  rG  r   r   r  r  r!   r   r9   r   c                 s   r  r   r   r  r   r   r   r  7  r  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>)rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   )rp   r  r~   ri   rz   r   r*   rN  Zis_contiguousr   _compute_lossr  rA   rE   rF   rG   rH   rI   rJ   rK   rL   rM   )rU   r}   r   rC  rD  r  rE  r'  rF  rE   r  rG  rU  r   r   r  r  r   r   r)   predicting_streamspredict_logitsrC   rD   rB   
all_logitsr   r   r   r     s`   >

$.z*ProphetNetForConditionalGeneration.forwardrv   c                 C     | | jj|d|d|}t| jjD ]}|dkr#| jr# n|||d d d d f< q|dd }t	j
j|d|ddtjd}t	j
j||ddd}| jjdkr|jddd	 }||d}	||	 }| }| jj|d }
d
| jj | |
|  }|S Nr   r   r9   r   rk   )Z	reductionrj   T)r   Zkeepdimr  ry   rp   r*   r<   Zfill_r'   rM  r   r   r   r   Zlog_softmaxr   r   r   Znll_lossZepssumnerk   rU   rC   rU  Zignore_indexZexpend_targetsiZlprobsrB   Zsmooth_lossZnon_masked_tokensZeps_ir   r   r   rV  I  (   $z0ProphetNetForConditionalGeneration._compute_lossc                 C   s
   |  |S r   )r~   )rU   rU  r   r   r   %prepare_decoder_input_ids_from_labelse  r   zHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsc                    sB   d}| D ]}|t  fdd|d d D |dd   f7 }q|S )Nr   c                 3   $    | ]}| d  |jV  qdS r   NZindex_selectr   r   r   Z
past_statebeam_idxr   r   r  o     " zDProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>r!   r  rE   rh  Zreordered_pastZ
layer_pastr   rg  r   _reorder_cacheh  s   
z1ProphetNetForConditionalGeneration._reorder_cachec                 C   rS  r   )ri   r:  rT   r   r   r   r@  t  rT  z.ProphetNetForConditionalGeneration.get_encoderc                 C   rS  r   ri   r<  rT   r   r   r   rB  w  rT  z.ProphetNetForConditionalGeneration.get_decoder)NNNNNNNNNNNNNNNNrv   )rX   rY   rZ   rH  r   r   rP  rR  r?  r  r   r   r   r   rI  r   r   r   rA   r   rV  rb  staticmethodrl  r@  rB  r   r   r   r   r   rJ    s    	


y

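

# A minimal sketch (hypothetical standalone snippet, not part of the model code) of the
# n-gram target expansion performed by `_compute_loss` above, assuming ngram=2,
# ignore_index=-100 and disable_ngram_loss=False: every predict stream is trained
# against the same gold labels, so the targets are replicated along a new stream axis.
#
#     import torch
#
#     labels = torch.tensor([[12, 17, 23]])                  # (batch_size=1, seq_len=3)
#     expend_targets = labels.new_zeros(2, 1, 3).fill_(-100)
#     for i in range(2):                                     # one copy per stream
#         expend_targets[i, :, :] = labels
#     # `predict_logits` of shape (batch, ngram, seq_len, vocab) is transposed to
#     # (ngram, batch, seq_len, vocab) so it flattens in the same order as the targets.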
rJ  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                        sB  e Zd Zg dZdef fddZdd Zdd Zd	d
 Zdd Z	dd Z
dd Zdd Ze													d+deej deej deej deej deej deej deeeej   deej deej dee dee dee d ee d!eeef fd"d#Zd,d%d&Z				d-d'd(Zed)d* Z  ZS ).ProphetNetForCausalLM)z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightrK  rp   c                    s^   t |}d|_d|_t | t|| _|j| _	|j
| _
tj|j|jdd| _|   d S )NTFrL  )r7  r8  r;  r9  r   r   ProphetNetDecoderWrapperri   rx   rs   rM  r   rm   r   r   rN  r  r   r   r   r   r     s   

zProphetNetForCausalLM.__init__c                 C   s
   | j jjS r   ri   r<  r   rT   r   r   r   r    r   z*ProphetNetForCausalLM.get_input_embeddingsc                 C   s   || j j_d S r   rr  r  r   r   r   r    s   z*ProphetNetForCausalLM.set_input_embeddingsc                 C   r  r   rO  rT   r   r   r   rP    r	  z+ProphetNetForCausalLM.get_output_embeddingsc                 C   r
  r   rO  rQ  r   r   r   rR    r   z+ProphetNetForCausalLM.set_output_embeddingsc                 C   s$   | j jr| | jjj| j d S d S r   )rp   r=  r>  ri   r<  r   rN  rT   r   r   r   r?    s   z"ProphetNetForCausalLM._tie_weightsc                 C   s   || j _d S r   rm  )rU   r<  r   r   r   set_decoder  r   z!ProphetNetForCausalLM.set_decoderc                 C   rS  r   rm  rT   r   r   r   rB    rT  z!ProphetNetForCausalLM.get_decoderNr}   r   rL   r&  r  r'  rE   r  rU  r   r   r  r  r   c                 C   s4  |dur|n| j j}| jj|||||||||
|||d}|dur#|jn|jdd \}}|d || j j|d}| |}|dddf }| j jdkrU|ddddf nd}d}|	durc| ||	}|st	dd ||fD }|dur|f| |dd  S ||dd  S t
||||j|j|j|j|j|jd		S )
a	  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size - 1]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
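        >>> # An illustrative addition (assumes config.ngram > 1, as in the pretrained
        >>> # checkpoint): the n-gram stream predictions are returned alongside the
        >>> # usual next-token logits.
        >>> logits_ngram = outputs.logits_ngram  # logits for the 2nd, 3rd, ... future tokens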

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)r}   r   rL   r&  r  r'  rE   r  r   r   r  r  r!   r   r9   r   c                 s   r  r   r   r  r   r   r   r    r  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>)	rB   rC   rD   rE   rc   rd   re   rf   rJ   )rp   r  ri   r<  rz   r   r*   rN  rV  r  rg   rE   rc   rd   re   rf   rJ   )rU   r}   r   rL   r&  r  r'  rE   r  rU  r   r   r  r  r   r   r)   rW  rX  rC   rD   rB   rY  r   r   r   r     sJ   A 
$.zProphetNetForCausalLM.forwardrv   c                 C   rZ  r[  r\  r_  r   r   r   rV    ra  z#ProphetNetForCausalLM._compute_lossc                 K   s<   |d u r
| |j}|r|d d dd f }|||||dS )Nr9   )r}   r   r  rE   r   )Znew_onesrz   )rU   r}   rE   r   r  r   kwargsr   r   r   prepare_inputs_for_generation8  s   z3ProphetNetForCausalLM.prepare_inputs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr   c                 3   rc  rd  re  rf  rg  r   r   r  X  ri  z7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>rj  rk  r   rg  r   rl  R  s   z$ProphetNetForCausalLM._reorder_cache)NNNNNNNNNNNNNrn  )NNNN)rX   rY   rZ   rH  r   r   r  r  rP  rR  r?  rs  rB  r   r   r   r   r   r   r   rg   r   rV  ru  ro  rl  r   r   r   r   r   rp  {  sz    	


n
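

# A minimal sketch (hypothetical standalone snippet) of what `_reorder_cache` does during
# beam search: every cached key/value tensor is re-indexed along the beam dimension so
# that each surviving beam carries forward its own attention history.
#
#     import torch
#
#     past_state = torch.arange(6.0).view(3, 2)  # (num_beams=3, head_dim=2)
#     beam_idx = torch.tensor([2, 0, 0])         # beams selected at this step
#     reordered = past_state.index_select(0, beam_idx)
#     # row 0 is the old beam 2; rows 1 and 2 both continue the old beam 0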
rp  c                       s6   e Zd ZdZdef fddZdd Zdd Z  ZS )	rq  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
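    The wrapper keeps the decoder under the same `prophetnet.decoder.` attribute path used
    by the full encoder-decoder model, presumably so that checkpoint parameter names line
    up without remapping.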
    rp   c                    s@   t  | tj|j|j|jd| _t|| jd| _	| 
  d S )Nr   r  )r   r   r   rr   r   r   rx   r   r#  r<  r  r   r   r   r   r   c  s   z!ProphetNetDecoderWrapper.__init__c                 C   s   |  | j| j  d S r   )r>  r   r<  r  rT   r   r   r   r?  l  s   z%ProphetNetDecoderWrapper._tie_weightsc                 O   s   | j |i |S r   rA  )rU   argsrt  r   r   r   r   o  s   z ProphetNetDecoderWrapper.forward)	rX   rY   rZ   r[   r   r   r?  r   r   r   r   r   r   rq  ]  s
    	rq  )r#  r   rp  rJ  r4  rh   r   )9r[   r7  r2   rQ   dataclassesr   typingr   r   r   r   Ztorch.utils.checkpointr   r   Ztorch.nnr   Zactivationsr