
    fTh                    ~   S r SSKrSSKrSSKrSSKJr  SSKJrJrJ	r	  SSK
r
SSKr
SSK
JrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJr  \R<                  " \5      r S;S jr!S r"S;S jr#S r$\ " S S\5      5       r%\ " S S\5      5       r&\ " S S\5      5       r'\ " S S\5      5       r(\ " S S\5      5       r) " S S\RT                  5      r+ " S S \RX                  5      r- " S! S"\RX                  5      r. " S# S$\RX                  5      r/ " S% S&\RX                  5      r0 " S' S(\RX                  5      r1\" S)S*9 " S+ S,\)5      5       r2\" S-S*9 " S. S/\)5      5       r3\ " S0 S1\)5      5       r4\" S2S*9 " S3 S4\)\5      5       r5\" S5S*9 " S6 S7\)\5      5       r6 " S8 S9\)5      r7/ S:Qr8g)<zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)OptionalTupleUnion)Tensornn)	LayerNorm   )ACT2FN)GenerationMixin)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )ProphetNetConfigc                     U(       a,  [         R                  R                  U R                  5       US9$ [         R                  R                  X[        R
                  S9$ )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_traces      j/var/www/auris/envauris/lib/python3.13/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   '   sF    }}$$\%7%7%9s$CC}}$$\%--$PP    c                 h   [         R                  " XU 4X#S9[         R                  " U5      R                  -  nUR	                  5       R                  5       n[        U5       H,  nXV   R                  SSS9  XF   R                  U* S-   5        M.     SUSS2SS2S4'   [         R                  " XE/SS9$ )	z8
This function computes the bias for the predict stream
)devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr#   r   
left_blockright_block
stream_idxs          r    ngram_attention_biasr4   .   s    
 	

EO<VY\a\g\ghm\n\r\rr  ##%++-KEl
..qu.=$$j[1_5 # Jq!Qw99j.A66r!   c                    U* nSnU(       a[  U S-  n U[         R                  " U[         R                  " U5      5      R                  5       U -  -   n[         R                  " U5      nO+[         R
                  " U[         R                  " U5      5      nU S-  n[         R                  " XF5      nU[         R                  " UR                  5       U-  5      [        R                  " X-  5      -  X-
  -  -   n[         R                  " U[         R                  " U5      U S-
  -  5      R                  5       nU[         R                  " XtR                  5       U5      -   nU$ )zg
This function computes individual parts of the relative position buckets. For more detail, see paper.
r   r%   r   )r   lt
zeros_likeintabsmaxlogr   mathr(   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larges	            r    compute_relative_bucketsrH   ?   s>    10!Q& hh-u/?/?@V/WX\\^allm 	 "'+A!B!&+A5CSCSTjCk!lq Ixx.:Huyy)?)E)E)G))STW[W_W_ X  		  " "L 99\5??<+HKZ[O+\]aacL/%++hHbHbHdfr2ssr!   c                 x   UR                  S5      R                  SUR                  S5      S5      nX2R                  S5      -
  n[        R                  " US-
  U4SS9R                  S5      nUR                  SUR                  S5      S5      nXBR                  S5      -
  n[        XUSS9n[        XUSS9nXV4$ )ze
This function computes both main and predict relative position buckets. For more detail, see paper.
r   r   F)rB   )	unsqueezerepeatsizer   r.   rH   )r?   r@   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketss          r    #compute_all_stream_relative_bucketsrS   Z   s    
 &2%;%;A%>%E%EaIZIZ[]I^`a%b"%CF\F\]_F`%`" ,199lQ6F5U[]+^+h+hij+k(+O+V+VWXZfZkZklnZoqr+s(+ORhRhikRl+l( &>#ATY&" )A#GZ_)% *LLr!   c                   :   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   \S 5       rSrg)ProphetNetSeq2SeqLMOutputq   a6  
Base class for sequence-to-sequence language models outputs.

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, encoder_sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
        softmax, used to compute the weighted average in the self-attention heads.
Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentionsc                 P    [         R                  " S[        5        U R                  $ Nzi`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.warningswarnFutureWarningr_   selfs    r    decoder_cross_attentions2ProphetNetSeq2SeqLMOutput.decoder_cross_attentions   $    	

 $$$r!    )__name__
__module____qualname____firstlineno____doc__rW   r   r   FloatTensor__annotations__rX   rY   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   propertyrk   __static_attributes__rn   r!   r    rU   rU   q   sH   :x )-D(5$$
%,*.FHU&&'.04L(5,,-4:>OXeE$5$567>@D8E%*;*;$<=DFJ%0A0A*B!CJ=Au'8'8!9:ACGhuU->->'?@G;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:A% %r!   rU   c                      \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr
\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S	'   Sr\	\\R                        \S
'   Sr\	\\R                        \S'   Sr\	\R                     \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   \S 5       rSrg)ProphetNetSeq2SeqModelOutput   aj  
Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
decoding.

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, encoder_sequence_length, hidden_size)`.

        Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
    encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, encoder_sequence_length)`.

        Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
last_hidden_stateNlast_hidden_state_ngramrZ   r[   r\   r]   r^   r_   r`   ra   rb   c                 P    [         R                  " S[        5        U R                  $ rd   re   ri   s    r    rk   5ProphetNetSeq2SeqModelOutput.decoder_cross_attentions  rm   r!   rn   )ro   rp   rq   rr   rs   r   rt   ru   r|   r   rZ   r   r[   r\   r]   r^   r_   r`   ra   rb   rv   rk   rw   rn   r!   r    ry   ry      s+   <| (((;?Xe&7&78?:>OXeE$5$567>@D8E%*;*;$<=DFJ%0A0A*B!CJ=Au'8'8!9:ACGhuU->->'?@G;?huU%6%678?=Ax(9(9:A@D8E%*;*;$<=D=Au'8'8!9:A% %r!   ry   c                   t   \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr
\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S'   Sr\	\\R                        \S	'   Sr\	\\R                        \S
'   Sr\	\\R                        \S'   Srg)ProphetNetDecoderModelOutputi  a  
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
r{   Nr|   rZ   hidden_stateshidden_states_ngram
attentionsngram_attentionsr_   rn   )ro   rp   rq   rr   rs   r   rt   ru   r|   r   rZ   r   r   r   r   r   r_   rw   rn   r!   r    r   r     s    .` (((;?Xe&7&78?:>OXeE$5$567>8<M8E%"3"345<>B%(9(9":;B59Ju00129;?huU%6%678?;?huU%6%678?r!   r   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\\R                        \	S
'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)ProphetNetDecoderLMOutputiX  a  
Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

Args:
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`List[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
        num_attn_heads, decoder_sequence_length, embed_size_per_head)`).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, decoder_sequence_length, hidden_size)`.

        Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
        self-attention heads.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        encoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
        compute the weighted average in the
NrW   rX   rY   rZ   r   r   r   r   r_   rn   )ro   rp   rq   rr   rs   rW   r   r   rt   ru   rX   rY   rZ   r   r   r   r   r   r_   rw   rn   r!   r    r   r   X  s    /b )-D(5$$
%,*.FHU&&'.04L(5,,-4:>OXeE$5$567>8<M8E%"3"345<>B%(9(9":;B59Ju00129;?huU%6%678?;?huU%6%678?r!   r   c                   ,    \ rS rSr\rSrSrS rS r	Sr
g)ProphetNetPreTrainedModeli  
prophetnetTc                 F   [        U[        R                  5      (       ak  UR                  R                  R                  SU R                  R                  S9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ax  UR                  R                  R                  SU R                  R                  S9  UR                  b2  UR                  R                  UR                     R                  5         g g g )N        )meanstd)
isinstancer   Linearweightdatanormal_configinit_stdbiaszero_	Embeddingpadding_idx)rj   modules     r    _init_weights'ProphetNetPreTrainedModel._init_weights  s    fbii((MM&&CT[[5I5I&J{{&  &&( '--MM&&CT[[5I5I&J!!-""6#5#56<<> . .r!   c                    U R                   R                  nU R                   R                  nUc   S5       eUR                  UR                  5      nUSS S24   R                  5       USSS 24'   X$S'   Uc   S5       eUR                  US:H  U5        [        R                  " US:  5      R                  5       (       d   S	5       eU$ )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rJ   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
r   decoder_start_token_idpad_token_id	new_zerosshaper*   masked_fill_r   allitem)rj   	input_idsr   r   shifted_input_idss        r    _shift_right&ProphetNetPreTrainedModel._shift_right  s    !%!C!C{{//%1 	
F	
1 &//	@%.sCRCx%8%>%>%@#qr'"$:&!'\)\\'&&'8D'@,Oyy*a/05577s9ss7  r!   rn   N)ro   rp   rq   rr   r   config_classbase_model_prefixsupports_gradient_checkpointingr   r   rw   rn   r!   r    r   r     s    #L$&*#?!r!   r   c                   T   ^  \ rS rSrSrS\SS4U 4S jjrS
U 4S jjrU 4S jrS	r	U =r
$ )ProphetNetPositionalEmbeddingsi  z
This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
the forward function.
r   returnNc                    > UR                   U l        [        TU ]  UR                   UR                  UR
                  5        g N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   rj   r   	__class__s     r    r   'ProphetNetPositionalEmbeddings.__init__  s3     88779K9KVM`M`ar!   c                 &  > Ub  U R                   b   S5       eUc  Ub[  US   S   R                  S   nUS   U-   n[        R                  " S[        R                  US9[        U R                   U-   5      -  nOUc$  [        R                  " U[        R                  US9n[        R                  " USS9R                  U5      U-  R	                  5       U R                   -   nUR                  SU R                  S-
  5      n[        TU ]-  U5      U4$ )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r%   r   )r   r   r   r#   r   )r   r   r   r&   longr8   cumsumtype_asclampr   r   forward)	rj   inputs_shaper#   attention_maskrZ   rN   prev_num_input_idsnum_input_idsr   s	           r    r   &ProphetNetPositionalEmbeddings.forward  s   $$*:*:*B 	
Q	
C * &5Q%7%:%@%@%C" ,Q2D D$zz&

6R((=89  ")%*ZZEJJW]%^N LLQ7??OR``$&4++ ,
  ,11!T__q5HIw|,l::r!   c                 "   > [         TU ]  U5      $ r   )r   r   )rj   rN   r   s     r    _forward'ProphetNetPositionalEmbeddings._forward  s    w|,,r!   )r   )NNN)ro   rp   rq   rr   rs   r   r   r   r   rw   __classcell__r   s   @r    r   r     s.    b/ bD b;8- -r!   r   c                      ^  \ rS rSrSrS\S\4U 4S jjrS\R                  S\S\4S	 jr
     SS
\\	   S\\	   S\\	   S\\\	      S\S\\	\\	   4   4S jjrSrU =r$ )ProphetNetAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperr   num_attn_headsc                   > [         TU ]  5         UR                  nUR                  U l        UR                  U l        X l        X2-  U l        U R                  U-  U:X  d   S5       e[        R                  " X35      U l	        [        R                  " X35      U l
        [        R                  " X35      U l        [        R                  " X35      U l        g )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   key_proj
value_proj
query_projout_proj)rj   r   r   r   r   s       r    r   ProphetNetAttention.__init__  s    
 	((!'!9!9~~,#5}}~-< 	
4	
<
 		+;))K=))K=		+;r!   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ Nr   r%   viewr   r   	transpose
contiguous)rj   r   r   r   s       r    _shapeProphetNetAttention._shape  s7    {{3)<)<dmmLVVWXZ[\ggiir!   key_value_statesr   layer_head_maskpast_key_valueoutput_attentionsr   c                 *   UR                  5       u  pxn	US Ln
[        UR                  5       5      UUU	/:X  d   SXxU	4 SUR                  5        35       eU R                  U5      U R                  S-  -  nU
(       a  Ub  US   nUS   nOU
(       aE  U R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nODU R	                  U R                  U5      SU5      nU R	                  U R                  U5      SU5      nU
(       a  X4nXpR                  SU R                  4nU R	                  XU5      R                  " U6 nUR                  " U6 nUR                  " U6 nUR                  S5      n[        R                  " SXR                  SS	5      5      nXpR                  X4nUR                  5       U:w  a  [        S
U SUR                  5        35      eUb  UR                  5       S:X  a  S nXpR                  SU4nUb3  UR                  5       U:w  a  [        SU SUR                  5        35      eUb  UU-   nU(       a  UnOS n[        R                  R!                  USS9nUb  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSS5      UR                  XpR                  X5      -  nUR                  SSSS5      U-  n[        R                  R#                  UU R$                  U R&                  S9n[        R                  " SUU5      nXpR                  XR                  4nUR                  5       U:w  a  [        SU SUR                  5        35      eUR                  SS5      R)                  XxU	5      nU R+                  U5      n[        R                  R#                  UU R"                  U R&                  S9nUUU4$ )Nz Size of hidden states should be 	, but is       ?r   r   rJ   r%   zbsij,bsjk->bsikr
   z#Attention weights should have size z Attention mask should have size r   /Head mask for a single layer should be of size ptrainingz `attn_output` should have shape , but is of shape )rM   listr   r   r   r   r   r   r   r   einsumr   
ValueErrorr   r   r   r   r   r   r   reshaper   )rj   r   r   r   r   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputs                        r    r   ProphetNetAttention.forward  s8    ,9+=+=+?(
[ .T9M&&().
 
 	p .j;.N-OyYfYkYkYmXno		p 
 }59KL."<'*J)!,LT]]3C%Db*UJ;;t7G'H"jYL T]]=%A2zRJ;;t}'Er:VL
 )7N !"5"5r4==I
{{<*EJJJW__j1
#((*5//!$||$5|EYEYZ[]^E_`$&9&97L.0B>BRR[\h\m\m\o[pqrr %.*<*<*>!*C!N$&9&91gF%.*=*=*?>*Q??OyYgYlYlYnXopqq%'.8L$0!$(!}},,\r,B&"'')d.A.A-CC A4CVCVBXAY Z#((*+-C +//2q!<|?P?P//@ L
 %4$8$8B1$EH]$]!]]**$$]] + 


 ll#4j,O$&9&97MMR/??OOabmbrbrbtauvww!++Aq199*{[mmK0mm++K4<<RVR_R_+`1>AAr!   )r   r   r   r   r   r   r   r   )NNNNF)ro   rp   rq   rr   rs   r   r8   r   r   r   r   r   r   boolr   rw   r   r   s   @r    r   r     s    G< < <0jU\\ jC jc j .2+/,026"'`B #6*`B !(	`B
 "&)`B !v/`B  `B 
vx''	(`B `Br!   r   c                   >   ^  \ rS rSrSrS\S\4U 4S jjrS rSr	U =r
$ )ProphetNetFeedForwardii  ze
This is the residual two feed-forward layer block based on the original Transformer implementation.
r   ffn_dimc                 ,  > [         TU ]  5         [        UR                     U l        [
        R                  " UR                  U5      U l        [
        R                  " X!R                  5      U l	        UR                  U l
        UR                  U l        g r   )r   r   r   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )rj   r   r  r   s      r    r   ProphetNetFeedForward.__init__n  si    #F$>$>?IIf&8&8'Bii););<"(";";~~r!   c                 4   U R                  U5      nU R                  U5      n[        R                  R	                  XR
                  U R                  S9nU R                  U5      n[        R                  R	                  XR                  U R                  S9nU$ )Nr   )r  r  r   r   r   r  r   r  )rj   r   s     r    r   ProphetNetFeedForward.forwardv  s    ))-8**=9--m?V?Vaeanan-oM2--m||VZVcVc-dr!   )r  r  r   r  r  )ro   rp   rq   rr   rs   r   r8   r   r   rw   r   r   s   @r    r  r  i  s&    &/ &# & r!   r  c                   t   ^  \ rS rSrS\4U 4S jjrS rS r       SS\\	\
      4S jjrS rS	 rS
rU =r$ )ProphetNetNgramSelfAttentioni  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l        UR                  U R                  -  U l	        UR                  U l
        U R                  U R                  -  UR                  :X  d   S5       e[        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  U R                  U R                  -  5      U l        SU l        g )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r?   relative_max_distancenum_decoder_attention_headsr   r   r   r   r0   r   r   r   r   r   r   relative_pos_embeddingsr   r   s     r    r   %ProphetNetNgramSelfAttention.__init__  sa   !--!--%+%A%A"$@@~~!'!9!9**d.A.AA\\
}}t222f6H6HH 	
D	
H 		&"4"4f6H6HI))F$6$68J8JK))F$6$68J8JK 		&"4"4f6H6HI (*yy1C1CTEUEUX\XkXkEk'l$  r!   c                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ r   r   )rj   r   r   r   s       r    r   #ProphetNetNgramSelfAttention._shape  s7    {{:0C0CT]]S]]^_abcnnppr!   c                     SU l         g )NT)r   ri   s    r    prepare_for_onnx_export_5ProphetNetNgramSelfAttention.prepare_for_onnx_export_  s	    r!   r   c	           	         UR                  5       u  pn[        UR                  5       5      XU/:X  d   SXU4 SUR                   35       eU R                  U5      nU R	                  U5      nU R                  U5      nXR                  S-  -  nU R                  XU	5      nU R                  USU	5      nU R                  USU	5      nXR                  SU R                  4nUR                  " U6 nUR                  " U6 nUR                  " U6 nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUR                  SU R                  -   SS9nUS   USS  nnUS   USS  nnUS   USS  nnUS   USS  nnUb8  US   n[        R                  " UU4SS9nUS   n[        R                  " UU4SS9nUU4nU
SU R                  -   -  n[        R                  " S	UUR                  SS
5      5      nU R!                  UUX5      n UU -   nUb  UU-   n[#        USU R$                  S9R'                  U5      n!Ubw  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSS5      U!R                  XR                  SU5      -  n![(        R*                  R-                  U!U R.                  U R0                  S9n![        R                  " S	U!U5      n"U"R                  SS5      R3                  U	SUU5      n"U R5                  U"5      n"[        R6                  " US5      R                  XR                  U R                  UU R                  5      n#[        R6                  " U V$s/ s H  n$[        R                  " UU$/S5      PM     sn$S5      n%[        R6                  " USS9n&[        R                  " U V's/ s H+  n'[        R                  " UU'/S5      R9                  S5      PM-     sn'S5      n([        R                  " SU#U%45      n)U R;                  U&U)X5      n*U)U*-   n)Ub5  UR=                  SSSS
S5      nUR?                  U)R@                  5      nU)U-   n)[#        U)SU R$                  S9R'                  U)5      n+Ub]  UR                  5       U R                  4:X  d&   SU R                  4 SUR                  5        35       eUR                  SSSSS5      U+-  n+[(        R*                  R-                  U+U R.                  U R0                  S9n+[        R                  " SU+U(R                  SS5      45      n,U,R                  SS
5      n,U,R3                  XR                  UU5      n,U R5                  U,5      n,[        R                  " U"U,/S5      R                  U	SU5      n-U!R                  XR                  US5      n![(        R*                  R-                  U-U R,                  U R0                  S9n-U-U!U+U4$ s  sn$f s  sn'f )Nz#`hidden_states` should be of shape r   r   rJ   r   r   r%   r   zbntc,bncs->bntsr
   )r   r   r   r   r   zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)!rM   r   r   r   r   r   r   r   r   r   chunkr0   r   r.   r   r    get_main_relative_pos_embeddingsr   r   r   r   r   r   r   r   r   r   stackrK   #get_predict_relative_pos_embeddingspermutetor   ).rj   r   r   r   r   extended_predict_attention_maskrQ   rR   rN   r   ngram_sequence_lengthr   r   r   r   r   hidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listmain_key_statespredict_key_states_listmain_value_statespredict_value_states_listprev_main_key_statesprev_main_value_statesr/   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_stateskeypredict_key_statespredict_hidden_statesv_ppredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   s.                                                 r    r   $ProphetNetNgramSelfAttention.forward  s    :G9K9K9M6
;M&&()jQ\-]] 	
1*U`2`1a b##$&	
] }5]]=1
}5 $}}c'9: {{<
S[[R<
{{<Z@ "5"5r4==I
#((*5__j1
#((*5 +00TZZQ0G(..q4::~1.E$**1tzz>q*A(..q4::~1.E9KA9NPbcdcePf67H7KM^_`_aMb43B13EWXWYGZ07H7KM^_`_aMb4 %#1!#4 #ii)=(OUVWO%3A%6" %		+ACT*U[\ ] *+<= 0A

NC "LL):<MOhOhijlmOno (,'L'L 1<(
$ .0LL% 1N B!
 '#
$	 	 &"'')d.A.A-CC A4CVCVBXAY Z#((*+-C .221b!Q?/BVBV//_C O --//4CYCYdhdqdq/r
 !<<(9?L]^+55a;CCJPQSbdop==)9:  %{{+DaHMM

D$7$7$-- 

 #[[Zq)rZqSV%))_c4JA*NZq)rtuv !&,FA N  %yyLefLeSUYY)3/3==a@Lefhi 
  %||,@CWYkBlm +/*R*R!#7+
'
  46UU*6.M.U.UVWYZ\]_`bc.d+.M.P.PQeQkQk.l+#7:Y#Y $ 
 '&
'	 	 &"'')d.A.A-CC A4CVCVBXAY Z#((*+-C "1!5!5aB1!EHZ!Z]]22$"8"84== 3 
 $ll #57K7U7UVWYZ7["\
 2;;AqA199*jjRacno"mm,?@ ii!13F GKPPQ[]_alm)..z;N;NP_acdmm++K4<<RVR_R_+`O-?OOI *s gs   <#Y2Yc                    UR                   u  pVpxUR                  XVXx5      nUc  UR                   S S u  pY[        R                  " SUR                   S   S-   5      R	                  S5      R	                  S5      R                  XYS5      R                  UR                  5      n
XR	                  S5      R                  XYS5      -
  n
[        U R                  U R                  U
S5      nU R                  U5      nUR                  UR                   S S U R                  U R                  4-   5      nUR                  SSSS5      nUR                  UR                   S S S-   5      nUR                  SU R                  S5      nUR                  SUR                   S   5      nUR                  5       nUR                  SUR!                  S5      5      n[        R"                  " USUS9nUR                  XVUS5      nU$ )	Nr%   r   rJ   r   Fr
   )rJ   r   index)r   r   r   arangerK   rL   r  r#   rH   r?   r  r  r   r  r   r   rM   gather)rj   r   r   rN   rQ   r   r   r   r   r/   rA   rel_pos_embeddingsr1  s                r    r  =ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsV  s    8D7I7I4
G#((WV)1*7*=*=bq*A'JQ 2 22 6 :;11
Q7L''(  "46L6LQ6O6V6VWatu6v!v-E  $"<"<>PRW.*
 "99-H/44$$Ra(D,<,<d>Q>Q+RR
 0771aC/778J8J2A8NQV8VW)G)N)NqRVReRegh)i&)G)L)L.44R8*
& *H)L)L)N&/77<N<S<STV<WX',||4FAUs't$'C'H'Helnp'q$++r!   c                     UR                   SS u  pVUc  UR                   S   nUS   S   US-
  :X  d   S5       e[        R                  " SU5      R                  S5      R                  S5      R	                  XVS5      R                  UR                  5      nXR                  S5      R	                  XVS5      -
  n[        U R                  U R                  US5      nUR                  SS5      nU R                  U5      n	U	R                  UR                   S S U R                  U R                  4-   5      n	U	R                  SSSSS5      n	U	R                  SU R                  5      n	UR                  S5      nUR	                  U R                   SU R                  S5      nUR                  SUR#                  S5      5      R%                  5       n[        R&                  " U	SUS	9n
U
R                  XPR                   U R                  US5      n
U
$ )
Nr   r%   rJ   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr  r
   r@  )r   r   rB  rK   rL   r  r#   rH   r?   r  r   r  r   r   r  r   r0   rM   r   rC  )rj   r   r   rN   rR   r   r/   key_sequence_lengthrA   rD  r;  s              r    r  @ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddings  s     '4&9&9!A&>#
,4"."4"4R"8?1%)<q)@@ t@ Q 3411
Q7L''(  "46L6LQ6O6V6VWatu6v!v0H  $"<"<>PRW1-
 &//15!99-H 044$(8(8$:M:M'NN
 0771aAF/77D<L<LM,M,W,WXY,Z),M,T,TJJ4..-
) -N,R,R166r:-

$& 	* +0,,A-N+
'
 +J*N*N

D$7$7"+
' /.r!   )r   r   r   r   r   r0   r   r?   r   r   r   r  r  r   NNNNNNN)ro   rp   rq   rr   r   r   r   r  r   r   r   r   r  r  rw   r   r   s   @r    r  r    s_     /  :q 37(,'+*.pP !v/pPd+,Z9/ 9/r!   r  c                   H   ^  \ rS rSrSrS\4U 4S jjr SS\4S jjrSr	U =r
$ )	ProphetNetEncoderLayeri  z
Encoder block for Prophetnet
r   c                    > [         TU ]  5         [        XR                  5      U l        [        UR                  5      U l        [        XR                  5      U l
        [        UR                  5      U l        g r   )r   r   r   num_encoder_attention_heads	self_attnr	   r   self_attn_layer_normr  encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   s     r    r   ProphetNetEncoderLayer.__init__  s[    ,V5W5WX$-f.@.@$A! 2&:P:PQ'01C1C'D$r!   r   c                     U R                  UUUUS9u  pVnU R                  XQ-   5      nU R                  U5      nU R                  X-   5      nU4n	U(       a  X4-  n	U	$ )N)r   r   r   r   )rN  rO  rQ  rR  )
rj   r   r   r   r   attention_outputr   _feed_forward_outputoutputss
             r    r   ProphetNetEncoderLayer.forward  s     -1NN')+/	 -; -
) 112B2RS #//>445H5XY "&Gr!   )rQ  rR  rN  rO  Fro   rp   rq   rr   rs   r   r   r   r   rw   r   r   s   @r    rK  rK    s0    E/ E #(
   r!   rK  c                   b   ^  \ rS rSrSrS\4U 4S jjr            S	S\S\4S jjrSr	U =r
$ )
ProphetNetDecoderLayeri  z
Decoder block for Prophetnet
r   c                 h  > [         TU ]  5         [        U5      U l        [	        UR
                  5      U l        UR                  (       a4  [        XR                  5      U l
        [	        UR
                  5      U l        [        XR                  5      U l        [	        UR
                  5      U l        g r   )r   r   r  rN  r	   r   rO  add_cross_attentionr   r  
cross_attncross_attn_layer_normr  decoder_ffn_dimrQ  rR  r   s     r    r   ProphetNetDecoderLayer.__init__  s    5f=$-f.@.@$A! %%1&:\:\]DO)263E3E)FD& 2&:P:PQ'01C1C'D$r!   	use_cacher   c                 ~   Ub  US S OS nU R                  UUUUUUU	U
S9u  nnnnU R                  X-   5      nUb  USS  OS nS nUb1  U R                  UUUUUUS9u  nnnU R                  UU-   5      nUU-   nU R	                  U5      nU R                  UU-   5      nU4nU(       a  UUUU4-  nU(       a  UU4-  nU$ )Nr%   )r   r   r   r   r   rQ   rR   rN   )r   r   r   r   r   r   )rN  rO  r`  ra  rQ  rR  )rj   r   r   ra   encoder_attn_maskr   cross_attn_layer_head_maskr   rQ   rR   rN   r   rd  r   self_attn_past_key_valuengram_attention_outputself_attn_weightsself_attn_weights_ngrampresent_key_valuecross_attn_past_key_valuecross_attn_weightsrU  cross_attn_present_key_valuerW  rX  s                            r    r   ProphetNetDecoderLayer.forward  sH   $ :H9S>"1#5Y] `d`n`n'3)+,K+I.O% ao 	a
] 13JL] 11-2XY <J;UN23$7[_!! ,QUQ`Q`+!60 :8"3 Ra RN02N !667G-7WXM !24P P #//>445H=5XY ")+BDVWWG)++Gr!   )r`  ra  rQ  rR  rN  rO  )NNNNNNNNNNTFr[  r   s   @r    r]  r]    s[    E/ E$ "#'(,'+*."'= =  = =r!   r]  z=
    The standalone encoder part of the ProphetNetModel.
    )custom_introc                   "  ^  \ rS rSrSS\S\R                  4U 4S jjjrS rS r	\
       SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )ProphetNetEncoderi>  r   word_embeddingsc                   > [         TU ]  U5        Ub  UO3[        R                  " UR                  UR
                  UR                  S9U l        [        U5      U l	        [        UR
                  5      U l        [        R                  " [        UR                  5       Vs/ s H  n[        U5      PM     sn5      U l        SU l        U R%                  5         gs  snf a  
word_embeddings (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
    The word embedding parameters. This can be used to initialize [`ProphetNetEncoder`] with pre-defined word
    embeddings instead of randomly initialized word embeddings.
Nr   F)r   r   r   r   
vocab_sizer   r   ru  r   position_embeddingsr	   embeddings_layer_norm
ModuleListr+   num_encoder_layersrK  layersgradient_checkpointing	post_initrj   r   ru  rV  r   s       r    r   ProphetNetEncoder.__init__D  s     	  * f//1C1CQWQdQde 	
 $B&#I %.v/A/A%B"mmUSYSlSlMm$nMm%;F%CMm$no&+#	 %os    Cc                     U R                   $ r   ru  ri   s    r    get_input_embeddings&ProphetNetEncoder.get_input_embeddingsZ      ###r!   c                     Xl         g r   r  rj   values     r    set_input_embeddings&ProphetNetEncoder.set_input_embeddings]      $r!   r   r   	head_maskinputs_embedsr   output_hidden_statesreturn_dictr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  Uc  [	        S5      eUb  Ub  [	        S5      eUb  Uc  U R                  U5      nUb}  SUSS2SSSS24   R                  SU R                   R                  SS5      -
  [        R                  " U R                  5      R                  -  nUR                  UR                  5      nOSnU R                  UR                  SS UR                  5      u  pXI-   nU R!                  U5      n["        R$                  R'                  XR                   R&                  U R(                  S9nU(       a  SOSnU(       a  SOSnUb\  UR+                  5       S	   [-        U R.                  5      :X  d2   S
[-        U R.                  5       SUR+                  5       S	    S35       e[1        U R.                  5       H  u  pU(       a  X4-   nU R2                  (       a8  U R(                  (       a'  U R5                  UR6                  UUUb  X>   OSU5      nOU" UUUb  X>   OSUS9nUS	   nU(       d  Mx  UUS   4-   nM     U(       a  X4-   nU(       d  [9        S XU4 5       5      $ [;        XUS9$ )a  
Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetEncoder
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r%   r   rn   r   z&The head_mask should be specified for  layers, but it is for .)r   r   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rn   .0vs     r    	<genexpr>,ProphetNetEncoder.forward.<locals>.<genexpr>  s     l$Zq$Z   	)r{   r   r   )r   r   r  use_return_dictr   ru  rL   rM  r   r'   r   r(   r  rz  r   r#   r{  r   r   r   r   rM   lenr~  	enumerater  _gradient_checkpointing_func__call__tupler   )rj   r   r   r  r  r   r  r  extended_attention_maskrz  rN   r   ra   all_attentionsidxencoder_layerlayer_outputss                    r    r   ProphetNetEncoder.forward`  s   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6RSS"}'@QRR"}'< 00;M %nQdA%56==aAhAhjkmnooDJJ'++',# '>&@&@ATAT&U#&*#,0,D,D]EXEXY[Z[E\^k^r^r,s)%;22=A--m{{?R?R]a]j]j-k&:0d  >>#A&3t{{+;< 8T[[9I8JJabkbpbpbrstbuavvwx< #,DKK"8C#(=@P(P%**t}} $ A A!**!+'0'<Y^$%! !.!#:7@7LY^RV&7	! *!,M  !/=3C2E!E/ #92  $9<L$L!l]>$Zlll+]k
 	
r!   )r{  r  r~  rz  ru  r   rI  )ro   rp   rq   rr   r   r   r   r   r  r  r   r   r   r   r   r   r   r   r   rw   r   r   s   @r    rt  rt  >  s    / ",,  ,$%  -115,004,0/3&*]
ELL)]
 !.]
 ELL)	]

  -]
 $D>]
 'tn]
 d^]
 
uo%	&]
 ]
r!   rt  z=
    The standalone decoder part of the ProphetNetModel.
    c                     ^  \ rS rSrSS\S\\R                     4U 4S jjjrS r	S r
\            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\\\R                           S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rS rS rS rSrU =r$ )ProphetNetDecoderi  r   ru  c                   > [         TU ]  U5        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        Ub  UO3[        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        [        R                  " U R                  UR                  S5      U l        [        R"                  " [%        UR&                  5       Vs/ s H  n[)        U5      PM     sn5      U l        [-        UR                  5      U l        SU l        U R3                  5         gs  snf rw  )r   r   r0   r?   r  r   r   max_target_positionsr   r   ry  r   r   ru  r   rz  ngram_embeddingsr|  r+   num_decoder_layersr]  r~  r	   r{  r  r  r  s       r    r   ProphetNetDecoder.__init__  s    	 \\
!--%+%A%A"~~$*$B$B! * f//1C1CQWQdQde 	
 $B&#I  "TZZ9K9KT RmmUSYSlSlMm$nMm%;F%CMm$no%.v/A/A%B"&+# %os   Ec                     U R                   $ r   r  ri   s    r    r  &ProphetNetDecoder.get_input_embeddings  r  r!   c                     Xl         g r   r  r  s     r    r  &ProphetNetDecoder.set_input_embeddings  r  r!   r   r   ra   encoder_attention_maskr  cross_attn_head_maskrZ   r  rd  r   r  r  r   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  Uc  [        S5      eUb  Ub  [        S5      eUb  Uc  U R                  U5      nUR                  SS u  pU R                  X4UR                  US9u  nnUb  Su  nnOU R                  U5      u  nnU R                  R                  US-   5      nX-   nU R                  R                  nUb`  UR                  S5      S:X  d   S5       e[        U R                   5       Vs/ s H  nUUS-
     U-   R#                  USS5      PM!     nnSnSnOR[        U R                   5       Vs/ s H  nUUS-
     U-   PM     nnU R%                  UU5      nU R'                  UU5      nUb}  S	USS2SSSS24   R#                  SU R                   R(                  SS5      -
  [*        R,                  " U R.                  5      R0                  -  nUR3                  UR.                  5      nOSn[*        R4                  " U/U-   S5      nU R6                  (       a  U R7                  U5      n[8        R:                  R=                  UU R<                  U R>                  S
9nU(       a  SOSnU(       a  U R                   R                   S:  a  SOSnU
(       a  SOSnU
(       a  SOSnU
(       a  U R                   R@                  (       a  SOSnU RB                  (       a/  U R>                  (       a  U	(       a  [D        RG                  S5        Sn	U	(       a  SOSn [I        XV/SS/5       Hj  u  n!n"U!c  M  U!R                  5       S   [K        U RL                  5      :X  a  M7   SU" S[K        U RL                  5       SUR                  5       S    S35       e   [O        U RL                  5       GH-  u  n#n$U(       a8  UUSS2SU24   4-  nU R                   R                   S:  a  UUSS2US24   4-  nUb  UU#   OSn%U RB                  (       aJ  U R>                  (       a9  U RQ                  U$RR                  UUUUUb  UU#   OSUb  UU#   OSUUUUSU	U
5      n&O"U$" UUUUUb  UU#   OSUb  UU#   OSUUUUU%U	U
S9n&U&S   nU	(       a  U U&U
(       a  SOS   4-  n U
(       d  M  UU&S   4-  nUU&S   4-  nU R                   R@                  (       d  GM$  UU&S   4-  nGM0     U(       a8  UUSS2SU24   4-  nU R                   R                   S:  a  UUSS2US24   4-  nUSS2SU24   n'U R                   R                   S:  a  USS2US24   OSn(U(       d  [U        S U'U(U UUUUU4 5       5      $ [W        U'U(U UUUUUS9$ s  snf s  snf )a  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetDecoder
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> last_hidden_states = outputs.last_hidden_state
```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r%   )r#   rZ   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1r  r   rn   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr  r  zThe `z` should be specified for r  r  )r   ra   rg  r   rh  r   rQ   rR   rN   r   rd  r   r  r
   c              3   0   #    U  H  nUc  M  Uv   M     g 7fr   rn   r  s     r    r  ,ProphetNetDecoder.forward.<locals>.<genexpr>  s"      	A  	s   	)r{   r|   rZ   r   r   r   r   r_   ),r   rd  r   r  r  r   ru  r   rz  r#   !compute_buffered_relative_bucketsr   r  r   rM   r+   r0   rL   prepare_attention_maskprepare_predict_attention_maskr  r   r'   r   r(   r  r.   r{  r   r   r   r   r_  r  loggerwarning_oncezipr  r~  r  r  r  r  r   ))rj   r   r   ra   r  r  r  rZ   r  rd  r   r  r  r   r/   main_stream_pos_embedrN   rQ   rR   predicting_stream_pos_embedr   r  r0   ngram_hidden_statesr  r   extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attnspresent_key_values	attn_mask	mask_namer  decoder_layerr   r  r{   r|   s)                                            r    r   ProphetNetDecoder.forward  s   H "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6fgg"}'@eff"}'< 00;M&3&9&9"1&=#
.2.F.F) ''+ /G /
+| &PZM*,M
 66|D.1&*&>&>&G&GWXHX&Y# &=0077 & %%a(A- a- #4::.#.E "%!),/JJRRS]_`bcd.   # '+#.2+ Z__c_i_iYj#YjPU!%!),/JJYj   # '+&A&A-Q_&`#.2.Q.QR_ao.p+ "-,QdA-=>EEaIpIprsuvwwDJJ'++/,+ /N.P.PQ^QdQd.e+.2+		=/4G"GK%% 66}EM--mt||VZVcVc-d /C%/CHYHY\]H]cg&&7T'8d 1dkk6U6U"[_&&4==##p "	#,R$ %((IKYoKp$q Iy$ ~~'*s4;;/?@ I;&@T[[AQ@R S!(+,A/@ %r #,DKK"8C#--CSOCS@S2T1VV-;;$$q(2}QHXEX7Y6[[25D5P_S1VZN**t}} $ A A!**!+)3'0'<Ys^$2F2R)#.X\325 %!" !.!#:*?&E7@7LYs^RV5I5U,S1[_4S3Q6W!-#1'&7!$ *!,M"}:KQQR'S&UU"  %-*:)<<%&=+;*==&;;222#a(8'::Oo #9r  )mA?O?O<O.P-RR){{  1$.=ODTAT3U2WW. *!-=o-=*=>HLHYHY\]H]-?3C0C"Dcg  &+&12)*#	   ,/$;.7 >,3,	
 		
M##s   !&W/%W4c           	         UR                   u  p#[        R                  " SU R                  5      R	                  UR
                  5      R                  SS5      n[        U R                  U R                  U5      u  pEUS S 2S U2S U24   R                  USS5      n[        R                  " US S 2S U2S U24   US S 2S U2U R                  U R                  U-   24   /S5      R                  USS5      nXE4$ r   )r   r   rB  r  r  r#   rL   rS   r?   r  r.   )rj   rN   r   r/   main_relative_bucketspredict_relative_bucketss         r    r  3ProphetNetDecoder.compute_buffered_relative_buckets  s   &2&8&8#
||At'@'@ADD\EXEXY``abdef:]d88,;
7
 !6a9I/9IK[OK[6[ \ c cdnpqst u#(99(,<_,<>N>N)NO('')B)BTE^E^apEp)pp $
 &Q
" 	! %>>r!   c                 H   UR                   S S u  p4[        R                  " XD4[        R                  " UR                  5      R
                  UR                  UR                  S9n[        R                  " US5      nUS U2S U24   S S S S 2S S 24   R                  X0R                  R                  4UR                   -   5      nUb@  SUS S 2S S S S 24   -
  [        R                  " U R                  5      R
                  -  nXg-   nOUnUR                  UR                  5      $ )Nr%   r   r   r  )r   r   fullr'   r   r(   r#   triuexpandr   r  r  )rj   r   r   r   
seq_lengthcausal_maskextended_causal_maskr  s           r    r  (ProphetNetDecoder.prepare_attention_mask  s   !.!4!4Ra!8
 jj$KK++,00%% ''	
 jja0*;J;+CDT4QRTUEUV]]@@AKDUDUU 

 %'*^AtT1<L-M'MQVQ\Q\]a]g]gQhQlQl&l#&:&T#&:#&))-*=*=>>r!   c           	         UR                   S S u  p4[        U R                  U R                  UR                  UR
                  5      n[        R                  " US S 2S U2S U24   US S 2S U2U R                  U R                  U-   24   /SS9nUS S S S 2S S 2S S 24   R                  X0R                  R                  4UR                   -   5      nUb  SUS S 2S S S S S 24   -
  [        R                  " U R
                  5      R                  -  nUR                  X0R                  R                  U R                  XD45      n[        R                  " U[        R                  " U5      /SS9nXg-   nOUnUR                  UR
                  5      $ )Nr%   rJ   r   r  )r   r4   r  r0   r#   r   r   r.   r  r   r  r'   r(   r7   r  )	rj   r   r   r   r  predict_causal_maskextended_predict_causal_maskr  r   s	            r    r  0ProphetNetDecoder.prepare_predict_attention_mask  s   !.!4!4Ra!8
 3%%tzz=3G3GI\I\
 $ii#A{
{KZK$?@#{
{D$=$=@Y@Y\f@f$ff 
 (;4q!Q;N'O'V'V@@ADWD]D]](
$
 %'*^AtT4QR<R-S'SW\WbWbcgcmcmWnWrWr&r#&=&D&D[[DDdjjR\i'# ',ii(%*:*:;R*STZ\'# /K.d+.J+.11-2E2EFFr!   )r   r{  r  r~  r  r0   r  r?   rz  r  ru  r   )NNNNNNNNNNNN)ro   rp   rq   rr   r   r   r   r   r   r  r  r   r   r   r   r   r   r   r   r  r  r  rw   r   r   s   @r    r  r    s|   / (2<<BX  :$%  -1158<9=,07;@D04$(,0/3&*\
ELL)\
 !.\
  (5	\

 !) 6\
 ELL)\
 'u||4\
 "%ell(;"<=\
  -\
 D>\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
|?,?0!G !Gr!   r  c            $         ^  \ rS rSrSS/rS\4U 4S jjrS rS rS r	S	 r
S
 r\               SS\\R                     S\\R                     S\\R                     S\\R                      S\\R                     S\\R                     S\\R                     S\\   S\\\\R                           S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4 S jj5       rSrU =r$ )ProphetNetModeli  encoder.word_embeddings.weightdecoder.word_embeddings.weightr   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " U5      nSUl
        SUl        [        X R                  5      U l        [        R                  " U5      nSUl        SUl
        [        X0R                  5      U l        U R#                  5         g )Nrx  FT)r   r   r   r   ry  r   r   ru  copydeepcopyis_encoder_decoderrd  rt  encoder
is_decoderr  decoderr  )rj   r   encoder_configdecoder_configr   s       r    r   ProphetNetModel.__init__  s     !||F,=,=v?Q?Q_e_r_rsv.,1)#( (9M9MNv.$(!,1)(9M9MN 	r!   c                     U R                   $ r   r  ri   s    r    r  $ProphetNetModel.get_input_embeddings0  r  r!   c                 |    Xl         U R                   U R                  l         U R                   U R                  l         g r   )ru  r  r  r  s     r    r  $ProphetNetModel.set_input_embeddings3  s,    $'+';';$'+';';$r!   c                     U R                   R                  (       aa  U R                  U R                  R                  U R                  5        U R                  U R
                  R                  U R                  5        g g r   )r   tie_word_embeddings_tie_or_clone_weightsr  ru  r  ri   s    r    _tie_weightsProphetNetModel._tie_weights8  sT    ;;**&&t||'C'CTEYEYZ&&t||'C'CTEYEYZ +r!   c                     U R                   $ r   )r  ri   s    r    get_encoderProphetNetModel.get_encoder=      ||r!   c                     U R                   $ r   r  ri   s    r    get_decoderProphetNetModel.get_decoder@  r  r!   r   r   decoder_input_idsdecoder_attention_maskr  decoder_head_maskr  encoder_outputsrZ   r  decoder_inputs_embedsrd  r   r  r  r   c                 ^   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  U R                  UUUU
UUUS9nU R                  UUUS   UUUU	UUUUUS9nU(       d  UU-   $ [        UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  UR                  S9$ )aW  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetModel

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
>>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
```)r   r   r  r  r   r  r  r   )r   r   ra   r  r  r  rZ   r  r   r  rd  r  )r{   r|   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )r   rd  r   r  r  r  r  ry   r{   r|   rZ   r   r   r   r   r_   )rj   r   r   r  r  r  r  r  r  rZ   r  r   rd  r   r  r  decoder_outputss                    r    r   ProphetNetModel.forwardC  sT   r "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]""ll#-#+"3%9' + O ,,'1"1!"4#1'!5+//!5# ' 
 "_44+-??$3$K$K+;;"1"?"?(7(K(K.99%4%E%E,==&5&G&G"1"?"?.99
 	
r!   )r  r  ru  )NNNNNNNNNNNNNNN)ro   rp   rq   rr   _tied_weights_keysr   r   r  r  r  r  r  r   r   r   r   
BoolTensorr   r   r   ry   r   rw   r   r   s   @r    r  r    s   :<\]/ "$<
[
  -11548=A,0487;+/@D048<$(,0/3&*!h
ELL)h
 !.h
 $ELL1	h

 !))9)9 :h
 ELL)h
 $ELL1h
 'u||4h
 "%h
 "%ell(;"<=h
  -h
  (5h
 D>h
 $D>h
 'tnh
  d^!h
" 
u22	3#h
 h
r!   r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            &       z  ^  \ rS rSr/ SQrS\4U 4S jjrS rS rS r	S r
\                S!S	\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\\\R                           S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4"S jj5       rS"S jrS\R                  4S jr\S 5       rS rS rS rU =r$ )#"ProphetNetForConditionalGenerationi  )r  r  lm_head.weightr   c                   > [         TU ]  U5        [        U5      U l        UR                  U l        UR                  U l        [        R                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )r   r   r  r   r   r   disable_ngram_lossr   r   r   ry  lm_headr  r   s     r    r   +ProphetNetForConditionalGeneration.__init__  sd     )&1!.."(";";yy!3!3V5F5FUS 	r!   c                     U R                   $ r   r  ri   s    r    get_output_embeddings8ProphetNetForConditionalGeneration.get_output_embeddings  r  r!   c                     Xl         g r   r  rj   new_embeddingss     r    set_output_embeddings8ProphetNetForConditionalGeneration.set_output_embeddings      %r!   c                     U R                   R                  (       a1  U R                  U R                  R                  U R
                  5        g g r   )r   r  r  r   ru  r  ri   s    r    r  /ProphetNetForConditionalGeneration._tie_weights  s2    ;;**&&t'F'FU +r!   c                 .    U R                   R                  $ r   )r   ru  ri   s    r    r  7ProphetNetForConditionalGeneration.get_input_embeddings  s    ...r!   r   r   r  r  r  r  r  r  rZ   r  r   labelsrd  r   r  r  r   c                 x   Ub  UOU R                   R                  nUb  Uc  Uc  U R                  U5      nU R                  UUUUUUUUU	U
UUUUUS9nUb  UR                  OUR                  SS u  nnUS   R                  UU R                   R                  US5      nU R                  U5      nUSS2S4   nU R                   R                  S:  a  USS2SS24   OSnUR                  5       (       d  UR                  5       nSnUb  U R                  UU5      nU(       d+  [        S UU4 5       5      nUb  U4U-   USS -   $ UUSS -   $ [        UUUUR                  UR                  UR                  UR                   UR"                  UR$                  UR&                  UR(                  UR*                  S9$ )	a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
    config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
    labels in `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

>>> input_ids = tokenizer(
...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
... ).input_ids  # Batch size 1
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

>>> logits_next_token = outputs.logits  # logits to predict next token as usual
>>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
```N)r   r   r  r  r  r  r  r  rZ   r  r   rd  r   r  r  r%   r   rJ   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rn   r  s     r    r  =ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>7       R*@Qqq*@r  )rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )r   r  r   r   r   r   r0   r  is_contiguousr   _compute_lossr  rU   rZ   r[   r\   r]   r^   r_   r`   ra   rb   )rj   r   r   r  r  r  r  r  r  rZ   r  r   r  rd  r   r  r  rX  r   r/   predicting_streamspredict_logitsrX   rY   rW   
all_logitss                             r    r   *ProphetNetForConditionalGeneration.forward  s   | &1%<k$++B]B]"3";@U@] $ 1 1& 9//)/#9/!5++'"7/!5# " 
$ (9'D##J_JeJefhghJi 	$
O %QZ__Z9J9JO]_`&891%040A0AA0E~ae,4 ##%%&&(F%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7&-&C&C,3,O,O#*#=#=)0)I)I!(!9!9*1*K*K&-&C&C#*#=#= r!   c                    UR                  U R                  R                  UR                  S5      UR                  S5      5      R	                  U5      n[        U R                  R                  5       H'  nUS:  a  U R                  (       a    OX$US S 2S S 24'   M)     UR                  SS5      R                  5       n[        R                  R                  UR                  SUR                  S5      5      S[        R                  S9n[        R                  R                  XdR                  S5      SS9nU R                  R                   S:  a  UR#                  SSS	9* nUR%                  U5      R                  S5      n	X   nUR'                  5       nU R                  R                   UR                  S5      -  n
S
U R                  R                   -
  U-  X-  -   nU$ Nr   r   rJ   r   r   )	reductionr   T)r   keepdimr  r   r   r0   rM   fill_r+   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner   rj   rX   r  ignore_indexexpend_targetsilprobsrW   smooth_lossnon_masked_tokenseps_is              r    r"  0ProphetNetForConditionalGeneration._compute_lossI     ))$++*;*;V[[^V[[YZ^\bbcopt{{(()A1u00&,1a7# *
 !!!Q'224**KKFKKO,-- + 
 }}%%f.A.A".EQW%X;;??S !::"d:;;K . 1 1, ? D DR H%8K%**,KKKOOfkk"o5E$++//)T1E4GGDr!   c                 $    U R                  U5      $ r   )r   )rj   r  s     r    %prepare_decoder_input_ids_from_labelsHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelse  s      ((r!   c                 b   ^ SnU  H%  nU[        U4S jUS S  5       5      USS  -   4-  nM'     U$ )Nrn   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7fr   Nindex_selectr  r#   r  
past_statebeam_idxs     r    r  DProphetNetForConditionalGeneration._reorder_cache.<locals>.<genexpr>o  s1     rcqU_--aZ=N=N1OPPcq   7:r%   r  rZ   rF  reordered_past
layer_pasts    `  r    _reorder_cache1ProphetNetForConditionalGeneration._reorder_cacheh  sQ     )JrcmnpopcqrrQR.! N * r!   c                 .    U R                   R                  $ r   )r   r  ri   s    r    r  .ProphetNetForConditionalGeneration.get_encodert      &&&r!   c                 .    U R                   R                  $ r   r   r  ri   s    r    r  .ProphetNetForConditionalGeneration.get_decoderw  rQ  r!   r  r  r   r   )NNNNNNNNNNNNNNNNr   )ro   rp   rq   rr   r  r   r   r  r  r  r  r   r   r   r   r  r   r   r   rU   r   r"  r=  staticmethodrM  r  r  rw   r   r   s   @r    r  r    s    p	/ 	&V/  -11548=A,0487;26@D048<)-$(,0/3&*#wELL)w !.w $ELL1	w
 !))9)9 :w ELL)w $ELL1w 'u||4w "%,,/w "%ell(;"<=w  -w  (5w &w D>w $D>w  'tn!w" d^#w$ 
u//	0%w wr8)ELL )  '' 'r!   r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                      ^  \ rS rSr/ SQrS\4U 4S jjrS rS rS r	S r
S	 rS
 rS r\             SS\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\R"                     S\\\\R"                           S\\R"                     S\\R"                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       rS S jr    S!S jr\S 5       rSrU =r$ )"ProphetNetForCausalLMi{  )z!prophetnet.word_embeddings.weightz)prophetnet.decoder.word_embeddings.weightr  r   c                 N  > [         R                  " U5      nSUl        SUl        [        TU ]  U5        [        U5      U l        UR                  U l	        UR                  U l
        [        R                  " UR                  UR                  SS9U l        U R!                  5         g )NTFr
  )r  r  r  r  r   r   ProphetNetDecoderWrapperr   r   r   r  r   r   r   ry  r  r  r   s     r    r   ProphetNetForCausalLM.__init__  s    v& $)! 26:!.."(";";yy!3!3V5F5FUS 	r!   c                 B    U R                   R                  R                  $ r   r   r  ru  ri   s    r    r  *ProphetNetForCausalLM.get_input_embeddings  s    &&666r!   c                 8    XR                   R                  l        g r   r^  r  s     r    r  *ProphetNetForCausalLM.set_input_embeddings  s    27/r!   c                     U R                   $ r   r  ri   s    r    r  +ProphetNetForCausalLM.get_output_embeddings  r  r!   c                     Xl         g r   r  r  s     r    r  +ProphetNetForCausalLM.set_output_embeddings  r  r!   c                     U R                   R                  (       a;  U R                  U R                  R                  R
                  U R                  5        g g r   )r   r  r  r   r  ru  r  ri   s    r    r  "ProphetNetForCausalLM._tie_weights  s;    ;;**&&t'>'>'N'NPTP\P\] +r!   c                 $    XR                   l        g r   rS  )rj   r  s     r    set_decoder!ProphetNetForCausalLM.set_decoder  s    ")r!   c                 .    U R                   R                  $ r   rS  ri   s    r    r  !ProphetNetForCausalLM.get_decoder  rQ  r!   r   r   ra   r  r  r  rZ   r  r  rd  r   r  r  r   c                    Ub  UOU R                   R                  nU R                  R                  UUUUUUUUU
UUUS9nUb  UR                  OUR                  SS u  nnUS   R                  XR                   R                  US5      nU R                  U5      nUSS2S4   nU R                   R                  S:  a  USS2SS24   OSnSnU	b  U R                  UU	5      nU(       d+  [        S UU4 5       5      nUb  U4U-   USS -   $ UUSS -   $ [        UUUUR                  UR                  UR                  UR                  UR                  UR                   S9	$ )	a  
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
    `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

Example:

```python
>>> from transformers import AutoTokenizer, ProphetNetForCausalLM
>>> import torch

>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)

>>> logits = outputs.logits

>>> # Model can also be used with EncoderDecoder framework
>>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
>>> import torch

>>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
>>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
>>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
... )

>>> ARTICLE = (
...     "the us state department said wednesday it had received no "
...     "formal word from bolivia that it was expelling the us ambassador there "
...     "but said the charges made against him are `` baseless ."
... )
>>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
>>> labels = tokenizer_dec(
...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
... ).input_ids
>>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

>>> loss = outputs.loss
```N)r   r   ra   r  r  r  rZ   r  rd  r   r  r  r%   r   rJ   r   c              3   .   #    U  H  oc  M  Uv   M     g 7fr   rn   r  s     r    r  0ProphetNetForCausalLM.forward.<locals>.<genexpr>  r   r  )	rW   rX   rY   rZ   r   r   r   r   r_   )r   r  r   r  r   r   r0   r  r"  r  r   rZ   r   r   r   r   r_   )rj   r   r   ra   r  r  r  rZ   r  r  rd  r   r  r  rX  r   r/   r#  r$  rX   rY   rW   r%  s                          r    r   ProphetNetForCausalLM.forward  s   B &1%<k$++B]B] //)))"7#9!5+'/!5# * 
 :C9NiooTaTgTghjijTk#
O$QZ__Z9J9JO]_`&891%040A0AA0E~ae,4%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7%33$+$?$?"--!(!9!9!(!9!9
 
r!   c                    UR                  U R                  R                  UR                  S5      UR                  S5      5      R	                  U5      n[        U R                  R                  5       H'  nUS:  a  U R                  (       a    OX$US S 2S S 24'   M)     UR                  SS5      R                  5       n[        R                  R                  UR                  SUR                  S5      5      S[        R                  S9n[        R                  R                  XdR                  S5      SS9nU R                  R                   S:  a  UR#                  SSS	9* nUR%                  U5      R                  S5      n	X   nUR'                  5       nU R                  R                   UR                  S5      -  n
S
U R                  R                   -
  U-  X-  -   nU$ r(  r+  r2  s              r    r"  #ProphetNetForCausalLM._compute_loss  r;  r!   c                 r    Uc  UR                  UR                  5      nU(       a  US S 2SS 24   nUUUUUS.$ )NrJ   )r   r   r  rZ   rd  )new_onesr   )rj   r   rZ   r   r  rd  kwargss          r    prepare_inputs_for_generation3ProphetNetForCausalLM.prepare_inputs_for_generation8  sL     !&//	@N!!RS&)I #,"."
 	
r!   c                 P   ^ SnU  H  nU[        U4S jU 5       5      4-  nM     U$ )Nrn   c              3   x   >#    U  H/  oR                  S TR                  UR                  5      5      v   M1     g7frA  rB  rD  s     r    r  7ProphetNetForCausalLM._reorder_cache.<locals>.<genexpr>X  s1     ncmU_--aZ=N=N1OPPcmrH  rI  rJ  s    `  r    rM  $ProphetNetForCausalLM._reorder_cacheR  s:     )Jncmnn N * r!   rU  )NNNNNNNNNNNNNrV  )NNNN)ro   rp   rq   rr   r  r   r   r  r  r  r  r  ri  r  r   r   r   r   r   r   r   r   r   r"  rv  rW  rM  rw   r   r   s   @r    rY  rY  {  s   /  78&^*'  -1158<9=,07;@D04)-$(,0/3&*lELL)l !.l  (5	l
 !) 6l ELL)l 'u||4l "%ell(;"<=l  -l &l D>l $D>l 'tnl d^l 
u//	0l l\> 
4  r!   rY  c                   @   ^  \ rS rSrSrS\4U 4S jjrS rS rSr	U =r
$ )r[  i]  zx
This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
classes.
r   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  UR                  S9U l        [        XR                  S9U l	        U R                  5         g )Nrx  r  )r   r   r   r   ry  r   r   ru  r  r  r  r   s     r    r   !ProphetNetDecoderWrapper.__init__c  sV     !||F,=,=v?Q?Q_e_r_rs(AUAUV 	r!   c                 l    U R                  U R                  U R                  R                  5       5        g r   )r  ru  r  r  ri   s    r    r  %ProphetNetDecoderWrapper._tie_weightsl  s%    ""4#7#79Z9Z9\]r!   c                 &    U R                   " U0 UD6$ r   r  )rj   argsru  s      r    r    ProphetNetDecoderWrapper.forwardo  s    ||T,V,,r!   )r  ru  )ro   rp   rq   rr   rs   r   r   r  r   rw   r   r   s   @r    r[  r[  ]  s%    
/ ^- -r!   r[  )r  rt  rY  r  r  r   rZ  )9rs   r  r<   rf   dataclassesr   typingr   r   r   r   torch.utils.checkpointr   r   torch.nnr	   activationsr   
generationr   modeling_outputsr   modeling_utilsr   utilsr   r   r   configuration_prophetnetr   
get_loggerro   r  r   r4   rH   rS   rU   ry   r   r   r   r   r   Moduler   r  r  rK  r]  rt  r  r  r  rY  r[  __all__rn   r!   r    <module>r     sN   Y    ! ) )     ! ) / - 9 9 6 
		H	%Q7" 6M. Q% Q% Q%h R%; R% R%j 8@; 8@ 8@v :@ :@ :@z #! #! #!L(-R\\ (-V~B")) ~BBBII .|/299 |/~	(RYY (VQRYY Qh 
{
1 {

{
| 
RG1 RG
RGj
 P
/ P
 P
f 
D')BO D'
D'N 
Z5 Z
Zz-8 -,r!   